In [1]:
%load_ext autoreload
%autoreload 2

In [36]:
import eda

import numpy as np
from scipy import stats

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [26]:
# use eda module to load dataset 
df = eda.load_dataset(url="https://raw.github.com/ant1code/tweet-sentiment/master/data/train.csv")

# data preprocessing: removing na, mapping categorical variable to numerical
df = eda.preprocess_data(df)

# normalizing the tweets (removing unnecessary words, lemmatizing)
stop_words = eda.get_stop_words()
df = eda.normalize(df, stop_words)

In [22]:
# peek at dataset 
# recall that negative: 0, neutral: 1, positive: 2
df.head()

Unnamed: 0,text,sentiment,lemmatized
0,Spent the entire morning in a meeting w/ a ven...,1,spent entire morning meeting w vendor boss not...
1,Oh! Good idea about putting them on ice cream,2,oh good idea put ice cream
2,says good (or should i say bad?) afternoon! h...,1,say good say bad afternoon
3,i dont think you can vote anymore! i tried,0,dont think vote anymore try
4,haha better drunken tweeting you mean?,2,haha well drunken tweet mean


In [38]:
vectorizer = TfidfVectorizer()
bow = vectorizer.fit_transform(df['lemmatized'])
sentiment = df['sentiment']

In [39]:
len(vectorizer.get_feature_names())

21359

In [40]:
vectorizer = TfidfVectorizer(min_df=5)
bow = vectorizer.fit_transform(df['lemmatized'])
len(vectorizer.get_feature_names())

3749

In [41]:
bow

<27485x3749 sparse matrix of type '<class 'numpy.float64'>'
	with 166170 stored elements in Compressed Sparse Row format>

In [42]:
X_train, X_test, y_train, y_test = train_test_split(bow, sentiment, test_size=0.33)

In [43]:
classifier = RandomForestClassifier(random_state=42, bootstrap=False)
classifier.fit(X_train,y_train)
classifier.score(X_test,y_test)

0.7005842795722632

In [45]:
# BONUS: hyperparameter optimization

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 2000, stop = 4000, num = 200)]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(100, 2000, num = 100)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [320, 640, 1280, 2560]
# Minimum number of samples required at each leaf node
min_samples_leaf = [16, 32]
# Method of selecting samples for training each tree
bootstrap = [False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

classifier = RandomForestClassifier()

random_search = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid,
                                   cv = 3, verbose = 2, random_state = 42, 
                                   n_iter=3, n_jobs= -1, return_train_score = True)

random_search.fit(X_train,y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  5.7min remaining:  7.2min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  7.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  7.8min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [46]:
# Take the best of the 6 trained models
random_search.best_params_

{'n_estimators': 3587,
 'min_samples_split': 640,
 'min_samples_leaf': 16,
 'max_features': 'sqrt',
 'max_depth': 1558,
 'bootstrap': False}