In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
import nltk
from nltk.corpus import stopwords
import string
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

In [4]:
df = pd.read_csv('../Data/data.csv', encoding='latin-1')
df.rename(columns={'emotion_in_tweet_is_directed_at':'object', 'is_there_an_emotion_directed_at_a_brand_or_product':'sentiment'}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  9092 non-null   object
 1   object      3291 non-null   object
 2   sentiment   9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


Remove row with missing text data.

In [5]:
df.dropna(thresh=2, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9092 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  9092 non-null   object
 1   object      3291 non-null   object
 2   sentiment   9092 non-null   object
dtypes: object(3)
memory usage: 284.1+ KB


In [6]:
df.head()

Unnamed: 0,tweet_text,object,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [7]:
df['sentiment'].value_counts()

No emotion toward brand or product    5388
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: sentiment, dtype: int64

In [8]:
df['sentiment'].replace("I can't tell", "Unknown", inplace=True)
df['sentiment'].replace("No emotion toward brand or product", "None", inplace=True)
df['sentiment'].replace(" emotion", "", inplace=True)
df['sentiment'].value_counts()

None                5388
Positive emotion    2978
Negative emotion     570
Unknown              156
Name: sentiment, dtype: int64

In [9]:
df.head()

Unnamed: 0,tweet_text,object,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [10]:
X = df[['tweet_text', 'object']]
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=18, stratify=y)

In [11]:
X_train.reset_index(inplace=True)
X_train.drop('index', axis=1, inplace=True)
X_test.reset_index(inplace=True)
X_test.drop('index', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Remove Stopwords

In [12]:
#Create List of stopwords & punctuation
stopwords_list = stopwords.words('english') + list(string.punctuation)
stopwords_list += ["''", '""', '...', '``']

The '@' and '#' symbols carry special meaning in text on twitter and other social media platforms.  To include this meaning in the analysis, we'll remove these characters from the stopwords list.  The dataset also includes '{link}' in place of any actual url links.  We'll leave '{' and '}' in the text to reflect this.

In [13]:
stopwords_list.remove('#')
stopwords_list.remove('@')
stopwords_list.remove('{')
stopwords_list.remove('}')

In [14]:
def remove_stopwords(text):
    tokens = nltk.word_tokenize(text)
    stopwords_removed = [token.lower() for token in tokens if token.lower() not in stopwords_list]
    return stopwords_removed    

In [15]:
processed = list(map(remove_stopwords, X_train['tweet_text']))

In [16]:
joined_text_list = []
for word_list in processed:
    joined_text = ' '.join(word_list)
    joined_text_list.append(joined_text)
joined_text_list[:5]

['ipad2 3 weeks w ipad since gave @ mention # sxsw withdrawal',
 'rt @ mention first shots w/ipad 2 # sxsw { link }',
 "rt @ mention ning amp mobile roadie thrilled offer unofficial # sxsw insider 's guide iphone fun austin { link }",
 "rt @ mention bounced catch google 's marissa mayer speak always admired intelligent classy successful # sxsw",
 'part journalsim support democracy yes informed populous yes ipad focus support # newsapps # sxsw']

In [17]:
X_train['processed_text'] = joined_text_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6819 entries, 0 to 6818
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_text      6819 non-null   object
 1   object          2467 non-null   object
 2   processed_text  6819 non-null   object
dtypes: object(3)
memory usage: 159.9+ KB


In [19]:
X_train.head()

Unnamed: 0,tweet_text,object,processed_text
0,No ipad2 for me. Now I have 3 weeks w no iPad ...,,ipad2 3 weeks w ipad since gave @ mention # sx...
1,RT @mention First shots w/iPad 2 from #sxsw {l...,,rt @ mention first shots w/ipad 2 # sxsw { link }
2,RT @mention Ning &amp; Mobile Roadie are thril...,,rt @ mention ning amp mobile roadie thrilled o...
3,RT @mention Bounced over to catch Google's Mar...,,rt @ mention bounced catch google 's marissa m...
4,Part of Journalsim is the support of democracy...,iPad,part journalsim support democracy yes informed...


## TF-IDF Vectorization

In [20]:
vectorizer = TfidfVectorizer()
tf_idf_data_train = vectorizer.fit_transform(X_train['processed_text'])
tf_idf_data_train.shape

(6819, 8530)

6.8k tweets with 8.5k unique words in the vocabulary.

## RF Classifier

In [21]:
#Create pipeline
pipe_forest = Pipeline([('forest', RandomForestClassifier(random_state=70, n_jobs=-1, bootstrap=True))])

In [25]:
# Create the grid parameter
grid_forest = [{'forest__n_estimators': [100, 200, 300],
             'forest__max_depth': [1, 5, 15, 25, 50],
             'forest__min_samples_split': [2, 5, 10, 25, 50], 
             'forest__min_samples_leaf': [1, 3, 5, 10, 25], 
             'forest__criterion': ['gini', 'entropy'],
             'forest__max_features': ['auto', 'sqrt', 'log2'],
             'forest__max_samples': [None, .2, .5, .8]
             }]

# Create the grid, with "pipe" as the estimator
gridsearch_forest = RandomizedSearchCV(estimator=pipe_forest, 
                          param_distributions=grid_forest, 
                          return_train_score=True, #Include training results in cv_results
                          cv=5, #Use 5 folds in CV process
                          n_iter=500, #Try 500 hyperparameter combinations
                          n_jobs=-1, #Use paralell computing
                          verbose=8) #Give updates on progress during fitting

In [26]:
gridsearch_forest.fit(tf_idf_data_train, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   37.7s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   53.3s
[Parallel(n_jobs=-1)]: Done 205 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 529 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 745 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 866 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 1136 tasks      | elapsed:  

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('forest',
                                              RandomForestClassifier(n_jobs=-1,
                                                                     random_state=70))]),
                   n_iter=500, n_jobs=-1,
                   param_distributions=[{'forest__criterion': ['gini',
                                                               'entropy'],
                                         'forest__max_depth': [1, 5, 15, 25,
                                                               50],
                                         'forest__max_features': ['auto',
                                                                  'sqrt',
                                                                  'log2'],
                                         'forest__max_samples': [None, 0.2, 0.5,
                                                                 0.8],
                                         

In [27]:
gridsearch_forest.best_params_

{'forest__n_estimators': 200,
 'forest__min_samples_split': 10,
 'forest__min_samples_leaf': 1,
 'forest__max_samples': None,
 'forest__max_features': 'auto',
 'forest__max_depth': 50,
 'forest__criterion': 'gini'}

In [28]:
gridsearch_forest_df = pd.DataFrame.from_dict(gridsearch_forest.cv_results_)
best_models = gridsearch_forest_df.loc[gridsearch_forest_df['rank_test_score'] < 6]
best_models

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_forest__n_estimators,param_forest__min_samples_split,param_forest__min_samples_leaf,param_forest__max_samples,param_forest__max_features,param_forest__max_depth,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
93,5.20817,0.196211,0.147405,0.048705,200,25,1,0.8,sqrt,50,...,0.634698,0.007771,2,0.710174,0.711641,0.707241,0.710907,0.701979,0.708389,0.003536
333,2.351125,0.147367,0.10814,0.001314,100,10,3,0.8,auto,50,...,0.632644,0.004954,3,0.676077,0.673877,0.669661,0.670761,0.669172,0.671909,0.00265
416,7.955804,0.655432,0.208804,0.001331,300,50,1,0.8,auto,50,...,0.63235,0.006633,5,0.694225,0.694959,0.693126,0.698808,0.690982,0.69442,0.002572
486,7.74725,0.17796,0.216324,0.005877,300,50,1,0.8,sqrt,50,...,0.63235,0.006633,5,0.694225,0.694959,0.693126,0.698808,0.690982,0.69442,0.002572
492,7.464928,0.134755,0.173347,0.052346,200,10,1,,auto,50,...,0.64027,0.006793,1,0.743355,0.737672,0.738955,0.741155,0.733871,0.739002,0.003215
494,6.230466,0.274357,0.129779,0.043996,200,50,1,0.8,auto,50,...,0.632351,0.006578,4,0.698625,0.696792,0.693309,0.696792,0.692815,0.695667,0.002235
