In [0]:
# importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np
from collections import Counter

In [0]:
# loading the dataset and dropping one column, since we only need the text and sentiment
dataset = pd.read_csv('train.csv')
df = pd.DataFrame(data=dataset, columns=['text', 'sentiment'])
df = df.dropna()
#creating word count, uppercase and special char. word count
import string
df['Word Count'] = [len(review.split()) for review in df['text']]

df['Uppercase Char Count'] = [sum(char.isupper() for char in review) \
                              for review in df['text']]                           

df['Special Char Count'] = [sum(char in string.punctuation for char in review) \
                            for review in df['text']]   
# Formatting the datatypes
df['sentiment'] = df['sentiment'].map({'positive': 2, 'neutral': 1, 'negative': 0})

In [7]:
df.head(10)

Unnamed: 0,text,sentiment,Word Count,Uppercase Char Count,Special Char Count
0,Spent the entire morning in a meeting w/ a ven...,1,28,3,5
1,Oh! Good idea about putting them on ice cream,2,9,2,1
2,says good (or should i say bad?) afternoon! h...,1,9,0,10
3,i dont think you can vote anymore! i tried,0,9,0,1
4,haha better drunken tweeting you mean?,2,6,0,1
5,headache wanna see my Julie,0,5,1,0
6,had an awsome salad! I recommend getting the S...,2,12,2,2
7,fine! Going to do my big walk today 20 or so ...,2,12,1,1
8,Thank a yoou how are you? #TwitterTakeover,2,7,3,2
9,Why don't adobe realise no one WANTS to pay fo...,1,23,7,1


In [0]:
# here should be shown that the descriptive statistcis (mean, median wordcount etc.) 
# are the same for all three groups.

In [9]:
df['text']

0        Spent the entire morning in a meeting w/ a ven...
1            Oh! Good idea about putting them on ice cream
2        says good (or should i say bad?) afternoon!  h...
3               i dont think you can vote anymore! i tried
4                   haha better drunken tweeting you mean?
                               ...                        
27481    I want to go to VP, but no one is willing to c...
27482                                Wah, why are you sad?
27483    playing sudoku while mommy makes me breakfast ...
27484                   see u bye see u!  i love the hot30
27485           ha ha, and what game is that? i like games
Name: text, Length: 27485, dtype: object

In [10]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['text'].apply(lambda x: [item for item in x if item not in stop])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0        [S, p, e, n,  , h, e,  , e, n, r, e,  , r, n, ...
1        [ , O, h, !,  , G,  , e,  , b, u,  , p, u, n, ...
2        [ , g,  , (, r,  , h, u, l,  ,  ,  , b, ?, ), ...
3        [ ,  , n,  , h, n, k,  , u,  , c, n,  , v, e, ...
4        [ , h, h,  , b, e, e, r,  , r, u, n, k, e, n, ...
                               ...                        
27481    [I,  , w, n,  ,  , g,  ,  , V, P, ,,  , b, u, ...
27482           [ , W, h, ,,  , w, h,  , r, e,  , u,  , ?]
27483    [p, l, n, g,  , u, k, u,  , w, h, l, e,  ,  , ...
27484    [ , e, e,  , u,  , b, e,  , e, e,  , u, !,  , ...
27485    [ ,  , h,  , h, ,,  , n,  , w, h,  , g, e,  , ...
Name: text, Length: 27485, dtype: object

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
bow = vectorizer.fit_transform(df['text'])
sentiment = df['sentiment']

In [12]:
len(vectorizer.get_feature_names())

26447

In [13]:
vectorizer = TfidfVectorizer(min_df=5)
bow = vectorizer.fit_transform(df['text'])
len(vectorizer.get_feature_names())

4454

In [0]:
# Useless feature, don't run this!
# Only getting about 0.4 accuracy with this opption turned on.
# from sklearn.feature_selection import SelectKBest, chi2
# select x features that have the strongest correlation to a class from the
# remaining y features.
# selected_features = \
# SelectKBest(chi2, k=3000).fit(bow, sentiment).get_support(indices=True)

In [15]:
vectorizer = TfidfVectorizer(min_df=5)
bow = vectorizer.fit_transform(df['text'])
bow

<27485x4454 sparse matrix of type '<class 'numpy.float64'>'
	with 285423 stored elements in Compressed Sparse Row format>

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(bow, sentiment, test_size=0.33)

In [58]:
# Training the classifier
from sklearn.ensemble import RandomForestClassifier as rfc
classifier = rfc(random_state=42, bootstrap=False)
classifier.fit(X_train,y_train)
classifier.score(X_test,y_test)

0.6783155109690222

In [41]:
# BONUS: hyperparameter optimization
from sklearn.model_selection import RandomizedSearchCV 
from scipy import stats

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 2000, stop = 4000, num = 200)]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(100, 2000, num = 100)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [320, 640, 1280, 2560]
# Minimum number of samples required at each leaf node
min_samples_leaf = [16, 32]
# Method of selecting samples for training each tree
bootstrap = [False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

classifier = rfc()

random_search = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid,
                                   cv = 3, verbose = 2, random_state = 42, 
                                   n_iter=3, n_jobs= -1, return_train_score = True)

random_search.fit(X_train,y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 11.9min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [42]:
# Take the best of the 6 trained models
random_search.best_params_

{'bootstrap': False,
 'max_depth': 1558,
 'max_features': 'sqrt',
 'min_samples_leaf': 16,
 'min_samples_split': 640,
 'n_estimators': 3587}