In [137]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [138]:
train = pd.read_csv('train.csv')

train_original=train.copy()

In [139]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [140]:
train

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954
...,...,...,...
15814,1,RT @ezlusztig: They took down the material on ...,22001
15815,2,RT @washingtonpost: How climate change could b...,17856
15816,0,notiven: RT: nytimesworld :What does Trump act...,384248
15817,-1,RT @sara8smiles: Hey liberals the climate chan...,819732


In [141]:
test = pd.read_csv('test.csv')

test_original=test.copy()

In [142]:
combine = train.append(test,ignore_index=True,sort=True)

In [143]:
combine.sentiment.value_counts()

 1.0    8530
 2.0    3640
 0.0    2353
-1.0    1296
Name: sentiment, dtype: int64

In [144]:
combine

Unnamed: 0,message,sentiment,tweetid
0,PolySciMajor EPA chief doesn't think carbon di...,1.0,625221
1,It's not like we lack evidence of anthropogeni...,1.0,126103
2,RT @RawStory: Researchers say we have three ye...,2.0,698562
3,#TodayinMaker# WIRED : 2016 was a pivotal year...,1.0,573736
4,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",1.0,466954
...,...,...,...
26360,"RT @BrittanyBohrer: Brb, writing a poem about ...",,895714
26361,2016: the year climate change came home: Durin...,,875167
26362,RT @loop_vanuatu: Pacific countries positive a...,,78329
26363,"RT @xanria_00018: You’re so hot, you must be t...",,867455


In [145]:
combine.head()

Unnamed: 0,message,sentiment,tweetid
0,PolySciMajor EPA chief doesn't think carbon di...,1.0,625221
1,It's not like we lack evidence of anthropogeni...,1.0,126103
2,RT @RawStory: Researchers say we have three ye...,2.0,698562
3,#TodayinMaker# WIRED : 2016 was a pivotal year...,1.0,573736
4,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",1.0,466954


### Remove capitalization and make everything lower case

In [83]:
combine['Tidy_Tweets'] = combine['message'].str.lower()

### remove URLs

In [84]:
combine['Tidy_Tweets'] = combine['Tidy_Tweets'].str.replace('http\S+|www.\S+', '', case=False)

### Removing Stop Words

In [85]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

combine['Tidy_Tweets'] = combine['Tidy_Tweets'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))
combine.head()

Unnamed: 0,message,sentiment,tweetid,Tidy_Tweets
0,PolySciMajor EPA chief doesn't think carbon di...,1.0,625221,polyscimajor epa chief think carbon dioxide ma...
1,It's not like we lack evidence of anthropogeni...,1.0,126103,like lack evidence anthropogenic global warming
2,RT @RawStory: Researchers say we have three ye...,2.0,698562,rt @rawstory: researchers say three years act ...
3,#TodayinMaker# WIRED : 2016 was a pivotal year...,1.0,573736,#todayinmaker# wired : 2016 pivotal year war c...
4,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",1.0,466954,"rt @soynoviodetodas: 2016, racist, sexist, cli..."


### Removing Punctuation, Numbers, and Special Characters

In [86]:
import string
def remove_punctuation_numbers(post):
    punc_numbers = string.punctuation + '0123456789'
    return ''.join([l for l in post if l not in punc_numbers])

combine['Tidy_Tweets'] = combine['Tidy_Tweets'].apply(remove_punctuation_numbers)

In [87]:
combine.head()

Unnamed: 0,message,sentiment,tweetid,Tidy_Tweets
0,PolySciMajor EPA chief doesn't think carbon di...,1.0,625221,polyscimajor epa chief think carbon dioxide ma...
1,It's not like we lack evidence of anthropogeni...,1.0,126103,like lack evidence anthropogenic global warming
2,RT @RawStory: Researchers say we have three ye...,2.0,698562,rt rawstory researchers say three years act cl...
3,#TodayinMaker# WIRED : 2016 was a pivotal year...,1.0,573736,todayinmaker wired pivotal year war climate ...
4,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",1.0,466954,rt soynoviodetodas racist sexist climate chan...


### Remove Emojis

In [88]:
combine['Tidy_Tweets'] = combine['Tidy_Tweets'].str.replace(r'[^\x00-\x7F]+', '', regex=True)

In [89]:
combine['Tidy_Tweets'] = combine['Tidy_Tweets'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [90]:
for x in combine['Tidy_Tweets'][100:300]:
    print(x)

director climate change denier coal lobbies deputy chief environment
irisrimon chinese broke massive iceberg antarctica part global warming hoax
thinkprogress tillersons climate change emails
mthgd global warming real club penguin shut
ndenicolamd doctors agree climate change making sick earthday marchforscience healthandclimate
tristinc president think climate change hoax made chinese vice president thinks curable disease
resevoirs methane resevoirs play substantial role global warming
fruitloopian snow march global warming
greenharvard universities uniquely important role play battle climate change
sethmacfarlane looking america officially believes climate change hoax sorry everybody else
popsci ugliest animals threatened climate change
realdonaldtrump potus fucking idiot geologist studied climate change college coauthored papers real
sensanders presidentelect believe climate change millions people going
someone explain march global warming
natgeochannel watch beforetheflood right he

### Tokenization

In [91]:
from nltk.tokenize import word_tokenize, TreebankWordTokenizer

In [92]:
tokeniser = TreebankWordTokenizer()
combine['Tidy_Tweets'] = combine['Tidy_Tweets'].apply(tokeniser.tokenize)

In [93]:
combine.head()

Unnamed: 0,message,sentiment,tweetid,Tidy_Tweets
0,PolySciMajor EPA chief doesn't think carbon di...,1.0,625221,"[polyscimajor, chief, think, carbon, dioxide, ..."
1,It's not like we lack evidence of anthropogeni...,1.0,126103,"[like, lack, evidence, anthropogenic, global, ..."
2,RT @RawStory: Researchers say we have three ye...,2.0,698562,"[rawstory, researchers, three, years, climate,..."
3,#TodayinMaker# WIRED : 2016 was a pivotal year...,1.0,573736,"[todayinmaker, wired, pivotal, year, climate, ..."
4,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",1.0,466954,"[soynoviodetodas, racist, sexist, climate, cha..."


### Lemmatizing

In [94]:
from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

In [95]:
def climate_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]  

In [96]:
combine['Tidy_Tweets'] = combine['Tidy_Tweets'].apply(climate_lemma, args=(lemmatizer, ))

In [97]:
combine.head()

Unnamed: 0,message,sentiment,tweetid,Tidy_Tweets
0,PolySciMajor EPA chief doesn't think carbon di...,1.0,625221,"[polyscimajor, chief, think, carbon, dioxide, ..."
1,It's not like we lack evidence of anthropogeni...,1.0,126103,"[like, lack, evidence, anthropogenic, global, ..."
2,RT @RawStory: Researchers say we have three ye...,2.0,698562,"[rawstory, researcher, three, year, climate, c..."
3,#TodayinMaker# WIRED : 2016 was a pivotal year...,1.0,573736,"[todayinmaker, wired, pivotal, year, climate, ..."
4,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",1.0,466954,"[soynoviodetodas, racist, sexist, climate, cha..."


In [98]:
combine['Tidy_Tweets'] = combine['Tidy_Tweets'].apply(' '.join)
combine.head()

Unnamed: 0,message,sentiment,tweetid,Tidy_Tweets
0,PolySciMajor EPA chief doesn't think carbon di...,1.0,625221,polyscimajor chief think carbon dioxide main c...
1,It's not like we lack evidence of anthropogeni...,1.0,126103,like lack evidence anthropogenic global warming
2,RT @RawStory: Researchers say we have three ye...,2.0,698562,rawstory researcher three year climate change ...
3,#TodayinMaker# WIRED : 2016 was a pivotal year...,1.0,573736,todayinmaker wired pivotal year climate change
4,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",1.0,466954,soynoviodetodas racist sexist climate change d...


## Extracting Features from cleaned Tweets

### Bag-of-Words Features

In [99]:
from sklearn.feature_extraction.text import CountVectorizer

bow_vectorizer = CountVectorizer(max_df=1.0, min_df=2, max_features=1000, stop_words='english')

# bag-of-words feature matrix
bow = bow_vectorizer.fit_transform(combine['Tidy_Tweets'])

df_bow = pd.DataFrame(bow.todense())

df_bow

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26360,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26361,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
26362,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### TF-IDF Features

In [100]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(max_df=0.70, min_df=2,max_features=1000,stop_words='english')

tfidf_matrix=tfidf.fit_transform(combine['Tidy_Tweets'])

df_tfidf = pd.DataFrame(tfidf_matrix.todense())

df_tfidf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.437471,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.534234,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
26361,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.627163,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
26362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
26363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.768585,0.0,0.0


## Splitting our dataset into Training and Validation Set

### Using the features from Bag-of-Words for training set

In [101]:
train_bow = bow[:15819]

train_bow.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Using features from TF-IDF for training set

In [102]:
train_tfidf_matrix = tfidf_matrix[:15819]

train_tfidf_matrix.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

### Splitting the data into training and validation set

In [103]:
from sklearn.model_selection import train_test_split

#### Bag-of-Words Features

In [104]:
x_train_bow, x_valid_bow, y_train_bow, y_valid_bow = train_test_split(train_bow,train['sentiment'],
                                                                      test_size=0.3,
                                                                      random_state=11)

#### TF-IDF features

In [105]:
x_train_tfidf, x_valid_tfidf, y_train_tfidf, y_valid_tfidf = train_test_split(train_tfidf_matrix,train['sentiment'],
                                                                              test_size=0.3,
                                                                              random_state=11)

## Applying Machine Learning Models

In [106]:
### Importing f1_score from sklearn
from sklearn.metrics import f1_score

### Logistic Regression

In [107]:
from sklearn.linear_model import LogisticRegression
Log_Reg = LogisticRegression(random_state=11,solver='lbfgs', max_iter=400)

### Bag-of-Words Features
Fitting the Logistic Regression Model.

In [108]:
Log_Reg.fit(x_train_bow,y_train_bow,)

LogisticRegression(max_iter=400, random_state=11)

### Predicting the probabilities.

In [109]:
prediction_bow = Log_Reg.predict(x_valid_bow)

prediction_bow

array([ 1,  0,  2, ..., -1,  1,  2], dtype=int64)

### Calculating the F1 score

In [110]:
log_bow = f1_score(y_valid_bow, prediction_bow, average='macro')
log_bow

0.5904303429630383

### TF-IDF Features
Fitting the Logistic Regression Model.

In [111]:
Log_Reg.fit(x_train_tfidf,y_train_tfidf)

LogisticRegression(max_iter=400, random_state=11)

#### Predicting the probabilities.

In [112]:
prediction_tfidf = Log_Reg.predict(x_valid_tfidf)

prediction_tfidf

array([1, 0, 2, ..., 1, 1, 2], dtype=int64)

### Calculate F1-score

In [113]:
log_tfidf = f1_score(y_valid_tfidf, prediction_tfidf, average='macro')
log_tfidf

0.5759683009107299

## XGBoost

In [114]:
from xgboost import XGBClassifier

### Bag-of-Words Features

In [115]:
model_bow = XGBClassifier(random_state=22,learning_rate=0.9)

In [116]:
##Fitting the XGBoost Model
model_bow.fit(x_train_bow, y_train_bow)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.9, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=22, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Predicting the probabilities.

In [117]:
xgb = model_bow.predict(x_valid_bow)

xgb

array([ 1,  1,  1, ..., -1,  1,  2], dtype=int64)

Calculating the F1 Score

In [118]:
xgb_bow=f1_score(y_valid_bow,xgb, average='macro')
xgb_bow

0.5998370725550446

### TDIF Features

In [119]:
model_tfidf = XGBClassifier()

In [120]:
#Fitting the XGBoost model
model_tfidf.fit(x_train_tfidf, y_train_tfidf)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [121]:
##Predicting the probabilities.
xgb_tfidf=model_tfidf.predict(x_valid_tfidf)

xgb_tfidf

array([1, 1, 1, ..., 1, 1, 2], dtype=int64)

In [122]:
# Calculating the F1 Score
score=f1_score(y_valid_tfidf,xgb_tfidf, average='macro')

score

0.5671245793108947

## Decision Trees

In [123]:
from sklearn.tree import DecisionTreeClassifier
dct = DecisionTreeClassifier(criterion='entropy', random_state=11)

### Bag-of-Words Features

In [124]:
#Fitting the Decision Tree model.
dct.fit(x_train_bow,y_train_bow)

DecisionTreeClassifier(criterion='entropy', random_state=11)

In [125]:
dct_bow = dct.predict(x_valid_bow)

dct_bow

array([ 1,  1,  2, ..., -1,  1,  2], dtype=int64)

In [126]:
# calculating f1 score
dct_score_bow=f1_score(y_valid_bow,dct_bow, average='macro')

dct_score_bow

0.5311575880098476

## TF-IDF Features

In [127]:
## Fitting the Decision Tree model
dct.fit(x_train_tfidf,y_train_tfidf)

DecisionTreeClassifier(criterion='entropy', random_state=11)

In [128]:
dct_tfidf = dct.predict(x_valid_tfidf)

dct_tfidf

array([ 1,  1,  0, ..., -1,  1,  2], dtype=int64)

In [129]:
### Calculating the F1 Score
dct_score_tfidf=f1_score(y_valid_tfidf,dct_tfidf, average ='macro')

dct_score_tfidf

0.5291142125450711

## Random Forest

In [130]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [131]:
rfc = RandomForestClassifier(n_estimators=800, max_depth=50,max_features='auto')
# Bag of words
rfc.fit(x_train_bow,y_train_bow)


RandomForestClassifier(max_depth=50, n_estimators=800)

In [132]:
rfc_prediction_bow = rfc.predict(x_valid_bow)

rfc_prediction_bow

array([1, 1, 1, ..., 1, 1, 2], dtype=int64)

In [133]:
log_bow = f1_score(y_valid_bow, rfc_prediction_bow, average ='macro')

log_bow

0.5305556761363619

# Model Comparison

## Bag-of-Words

In [134]:
Algo_1 = ['LogisticRegression(Bag-of-Words)','XGBoost(Bag-of-Words)','DecisionTree(Bag-of-Words)']

score_1 = [log_bow,xgb_bow,dct_score_bow]

compare_1 = pd.DataFrame({'Model':Algo_1,'F1_Score':score_1},index=[i for i in range(1,4)])

compare_1.T

Unnamed: 0,1,2,3
Model,LogisticRegression(Bag-of-Words),XGBoost(Bag-of-Words),DecisionTree(Bag-of-Words)
F1_Score,0.530556,0.599837,0.531158


## TF-IDF

In [135]:
Algo_2 = ['LogisticRegression(TF-IDF)','XGBoost(TF-IDF)','DecisionTree(TF-IDF)']

score_2 = [log_tfidf,score,dct_score_tfidf]

compare_2 = pd.DataFrame({'Model':Algo_2,'F1_Score':score_2},index=[i for i in range(1,4)])

compare_2.T

Unnamed: 0,1,2,3
Model,LogisticRegression(TF-IDF),XGBoost(TF-IDF),DecisionTree(TF-IDF)
F1_Score,0.575968,0.567125,0.529114


# Predicting the results for our test data

In [136]:
test_tfidf = bow[15819:]
test_pred = model_bow.predict(test_tfidf)
test['sentiment'] = test_pred
submission = test[['tweetid','sentiment']]
submission.to_csv('result_bow.csv', index=False)