In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import words
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
# Fetch every NLTK resource the notebook relies on so that it runs on a fresh
# environment: 'punkt' backs nltk.word_tokenize, 'stopwords' provides the
# English stopword list, and 'wordnet' is required by WordNetLemmatizer
# (without it the lemmatisation step raises LookupError).
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)
%matplotlib inline

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\robert\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\robert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the train and test splits from CSV files in the working directory.
# Note: despite the "X_" prefix, each frame also carries the 'label' column.
X_train, X_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would emit a stray "None" line.
X_train.info()
print(X_train.shape)
print(X_train.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    25000 non-null  object
 1   label   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB
None
(25000, 2)
                                                text  label
0  bromwell high is a cartoon comedy it ran at th...      1
1  homelessness or houselessness as george carlin...      1
2  brilliant over acting by lesley ann warren bes...      1
3  this is easily the most underrated film inn th...      1
4  this is not the typical mel brooks film it was...      1


In [4]:
# Default NLTK resources, built lazily on first use and then shared by every
# call. Rebuilding the stopword list and the lemmatizer for each document is
# pure overhead when the function is applied to tens of thousands of rows.
_NLTK_RESOURCES = {}


def lemmatise(text, stop_words=None, lemmatizer=None, tokenizer=None):
    """Reduce ``text`` to a space-joined string of lemmatised content words.

    The text is tokenised; tokens that are stopwords or are not purely
    alphabetic are discarded; each remaining token is lemmatised.
    Matching against the stopwords is case-sensitive (no lower-casing is done).

    Parameters
    ----------
    text : str
        Document to process. A missing value (``None`` or float NaN, as
        pandas produces for an empty CSV cell) yields ``""``.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stopword list.
    lemmatizer : object with a ``lemmatize(token)`` method, optional
        Defaults to a shared ``nltk.WordNetLemmatizer()``.
    tokenizer : callable taking a str and returning an iterable of str, optional
        Defaults to ``nltk.word_tokenize``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (possibly empty).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    if stop_words is None:
        stop_words = _NLTK_RESOURCES.get("stop_words")
        if stop_words is None:
            # frozenset gives O(1) membership tests instead of a list scan per token.
            stop_words = frozenset(nltk.corpus.stopwords.words('english'))
            _NLTK_RESOURCES["stop_words"] = stop_words

    if lemmatizer is None:
        lemmatizer = _NLTK_RESOURCES.get("lemmatizer")
        if lemmatizer is None:
            lemmatizer = nltk.WordNetLemmatizer()
            _NLTK_RESOURCES["lemmatizer"] = lemmatizer

    if tokenizer is None:
        tokenizer = nltk.word_tokenize

    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokenizer(text)
        if token not in stop_words and token.isalpha()
    ]
    return " ".join(lemmas)

In [5]:
# Add a 'lemmas' column (the output of lemmatise on 'text') to both splits.
for split in (X_train, X_test):
    split['lemmas'] = split['text'].apply(lemmatise)

In [9]:
# Spot-check the lemmatised text of the training row at position 10.
X_train['lemmas'].iat[10]

'first read armistead maupins story taken human drama displayed gabriel one care love said given film version excellent story expected see past gloss hollywood writer armistead maupin director patrick stettner truly succeeded right amount restraint robin williams capture fragile essence gabriel let u see struggle issue trust personnel life jess world around donna introduced player drama reminded nothing ever seems smallest event change life irrevocably request review book written young man turn life changing event help gabriel find strength within carry move forward bad people avoid film say average american probably think robin williams serious role work please give movie chance robin williams touch darkness must find go better people like movie one hour photo stepped actor made another quality piece art oh forget believe bobby cannavale jess steal every scene leading man look screen presence hack opinion could carry movie right'

**TF-IDF**

In [11]:
# TF-IDF over word 1- to 3-grams, with sublinear term-frequency scaling and
# the vocabulary capped at 50,000 features.
tfidf_options = dict(ngram_range=(1, 3), sublinear_tf=True, max_features=50000)
tv = TfidfVectorizer(**tfidf_options)

In [12]:
# Fit the vectoriser on the training lemmas only, then reuse the fitted
# vectoriser to transform the test lemmas.
train_docs, test_docs = X_train['lemmas'], X_test['lemmas']
train_tv = tv.fit_transform(train_docs)
test_tv = tv.transform(test_docs)

In [13]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() where it exists and fall back on older
# releases; .tolist() keeps `vocab` a plain list of str either way.
if hasattr(tv, 'get_feature_names_out'):
    vocab = tv.get_feature_names_out().tolist()
else:
    vocab = tv.get_feature_names()
print(vocab[:5])

['aag', 'aames', 'aardman', 'aaron', 'ab']


In [14]:
vocab_size = len(vocab)
print("Vocabulary length:", vocab_size)

Vocabulary length: 50000


In [15]:
# Sum the TF-IDF weights down each column, giving one total per vocabulary
# term over the training matrix, and show them as a one-row frame.
dist = train_tv.sum(axis=0)
checking = pd.DataFrame(data=dist, columns=vocab)
checking

Unnamed: 0,aag,aames,aardman,aaron,ab,abandon,abandoned,abandoning,abandonment,abbas,...,zone episode,zoo,zoom,zorro,zp,zu,zu warrior,zucker,zulu,zuniga
0,1.658447,1.315939,1.545078,2.986594,2.38458,5.29966,13.912787,1.749735,1.9084,1.258187,...,1.837946,3.287039,4.929196,4.472657,0.817298,3.24479,1.325528,2.033842,1.547369,2.161156


In [16]:
train_shape, test_shape = train_tv.shape, test_tv.shape
print('Training dim:', train_shape, 'Test dim:', test_shape)

Training dim: (25000, 50000) Test dim: (25000, 50000)


In [20]:
# Keep the TF-IDF matrices sparse instead of calling .toarray(): a dense
# 25000 x 50000 float64 array needs roughly 10 GB, and there were two of them.
# RandomForestClassifier and GridSearchCV accept SciPy sparse input directly.
# float32 is the dtype scikit-learn's tree code converts to internally, so
# casting once here avoids repeating that conversion on every fit.
# NOTE(review): scores should match the dense run — confirm against 0.85444.
train_vec = train_tv.astype(np.float32)
test_vec = test_tv.astype(np.float32)

**Random Forest**

In [28]:
# Baseline forest. max_features="auto" was deprecated in scikit-learn 1.1 and
# removed in 1.3; for classifiers it meant "sqrt", so "sqrt" is the same
# setting spelled in a way every version accepts. n_jobs=-1 builds the trees
# on all cores; with a fixed random_state the fitted forest is unchanged.
rfc = RandomForestClassifier(
    n_estimators=100,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1,
)
rfc.fit(train_vec, X_train['label'])
pred = rfc.predict(test_vec)

In [29]:
# Share of test rows whose predicted label equals the label in X_test.
test_accuracy = metrics.accuracy_score(X_test['label'], pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.85444


In [30]:
# Unfitted estimator that the grid search clones for each candidate.
rfc_gs = RandomForestClassifier(random_state=42)

# Search space: 3 x 2 x 5 = 30 candidates.
# 'auto' was dropped from max_features: for a classifier it is identical to
# 'sqrt' (so half the fits were duplicates) and it was removed in
# scikit-learn 1.3. 'log2' gives a second, genuinely different setting.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
}

In [33]:
# Exhaustive search over param_grid with 5-fold cross-validation. Every
# candidate is fitted once per fold, so this cell is slow on the full data.
CV_rfc = GridSearchCV(rfc_gs, param_grid, cv=5)
CV_rfc.fit(train_vec, y=X_train['label'])

KeyboardInterrupt: 

In [None]:
# Parameter combination chosen by the fitted grid search (CV_rfc must have
# finished fitting before this attribute exists).
CV_rfc.best_params_