## Library imports

In [1]:
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import spacy
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

from textblob import TextBlob
from wordcloud import WordCloud
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
SEED = 42
document_tweets = []
nlp = spacy.load('en_core_web_sm')

### Loading datasets

In [49]:
train_tweets = pd.read_csv('climate-change-belief-analysis/train.csv')
test_tweets = pd.read_csv('climate-change-belief-analysis/test.csv')


In [4]:
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [5]:
train_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  15819 non-null  int64 
 1   message    15819 non-null  object
 2   tweetid    15819 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 370.9+ KB


In [6]:
train_tweets['sentiment'].value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [7]:
test_tweets.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [8]:
test_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10546 entries, 0 to 10545
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  10546 non-null  object
 1   tweetid  10546 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 164.9+ KB


## Calculating basic metrics

### Word count

In [9]:
train_tweets['word_count'] = train_tweets['message'].apply(lambda tweet: len(tweet.split()))
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid,word_count
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,19
1,1,It's not like we lack evidence of anthropogeni...,126103,10
2,2,RT @RawStory: Researchers say we have three ye...,698562,19
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,15
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,18


### Character count

In [10]:
train_tweets['char_count'] = train_tweets['message'].apply(lambda tweet: sum([len(word) for word in tweet.split()]))
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid,word_count,char_count
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,19,122
1,1,It's not like we lack evidence of anthropogeni...,126103,10,53
2,2,RT @RawStory: Researchers say we have three ye...,698562,19,122
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,15,85
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,18,105


### Average word length

In [11]:
train_tweets['average_word_length'] = train_tweets['message'].apply(lambda tweet: sum([len(word) for word in tweet.split()]) / len(tweet.split()))
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,19,122,6.421053
1,1,It's not like we lack evidence of anthropogeni...,126103,10,53,5.3
2,2,RT @RawStory: Researchers say we have three ye...,698562,19,122,6.421053
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,15,85,5.666667
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,18,105,5.833333


### Stopword count

In [12]:
train_tweets['stopword_count'] = train_tweets['message'].apply(lambda tweet: len([word for word in tweet.split() if word in STOP_WORDS]))
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length,stopword_count
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,19,122,6.421053,3
1,1,It's not like we lack evidence of anthropogeni...,126103,10,53,5.3,3
2,2,RT @RawStory: Researchers say we have three ye...,698562,19,122,6.421053,8
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,15,85,5.666667,5
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,18,105,5.833333,5


### #Hashtag and @Mention counts

In [13]:
train_tweets['hastag_count'] = train_tweets['message'].apply(lambda tweet: len([word for word in tweet.split() if word.startswith('#')]))
train_tweets['mention_count'] = train_tweets['message'].apply(lambda tweet: len([word for word in tweet.split() if word.startswith('@')]))
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length,stopword_count,hastag_count,mention_count
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,19,122,6.421053,3,0,1
1,1,It's not like we lack evidence of anthropogeni...,126103,10,53,5.3,3,0,0
2,2,RT @RawStory: Researchers say we have three ye...,698562,19,122,6.421053,8,0,1
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,15,85,5.666667,5,1,0
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,18,105,5.833333,5,1,1


## Preprocessing and data cleaning

### Lower case conversion

In [14]:
train_tweets['message'] = train_tweets['message'].apply(lambda tweet: tweet.lower())
test_tweets['message'] = test_tweets['message'].apply(lambda tweet: tweet.lower())
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length,stopword_count,hastag_count,mention_count
0,1,polyscimajor epa chief doesn't think carbon di...,625221,19,122,6.421053,3,0,1
1,1,it's not like we lack evidence of anthropogeni...,126103,10,53,5.3,3,0,0
2,2,rt @rawstory: researchers say we have three ye...,698562,19,122,6.421053,8,0,1
3,1,#todayinmaker# wired : 2016 was a pivotal year...,573736,15,85,5.666667,5,1,0
4,1,"rt @soynoviodetodas: it's 2016, and a racist, ...",466954,18,105,5.833333,5,1,1


### Expand word contractions

### Email removal

In [15]:
train_tweets['message'] = train_tweets['message'].apply(lambda tweet: re.sub('[a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+', '', tweet))
test_tweets['message'] = test_tweets['message'].apply(lambda tweet: re.sub('[a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+', '', tweet))
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length,stopword_count,hastag_count,mention_count
0,1,polyscimajor epa chief doesn't think carbon di...,625221,19,122,6.421053,3,0,1
1,1,it's not like we lack evidence of anthropogeni...,126103,10,53,5.3,3,0,0
2,2,rt @rawstory: researchers say we have three ye...,698562,19,122,6.421053,8,0,1
3,1,#todayinmaker# wired : 2016 was a pivotal year...,573736,15,85,5.666667,5,1,0
4,1,"rt @soynoviodetodas: it's 2016, and a racist, ...",466954,18,105,5.833333,5,1,1


### URL removal

In [16]:
train_tweets['message'] = train_tweets['message'].apply(lambda tweet: re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', tweet))
test_tweets['message'] = test_tweets['message'].apply(lambda tweet: re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', tweet))

### Remove RT and mentions

In [17]:
train_tweets['message'] = train_tweets['message'].apply(lambda tweet: re.sub('rt', '', tweet))
test_tweets['message'] = test_tweets['message'].apply(lambda tweet: re.sub('rt', '', tweet))
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length,stopword_count,hastag_count,mention_count
0,1,polyscimajor epa chief doesn't think carbon di...,625221,19,122,6.421053,3,0,1
1,1,it's not like we lack evidence of anthropogeni...,126103,10,53,5.3,3,0,0
2,2,@rawstory: researchers say we have three year...,698562,19,122,6.421053,8,0,1
3,1,#todayinmaker# wired : 2016 was a pivotal year...,573736,15,85,5.666667,5,1,0
4,1,"@soynoviodetodas: it's 2016, and a racist, se...",466954,18,105,5.833333,5,1,1


In [18]:
# Remove @mentions

### Removal of punctuations and special characters

In [19]:
train_tweets['message'] = train_tweets['message'].apply(lambda tweet: re.sub('[^a-z A-Z 0-9-]', '', tweet))
test_tweets['message'] = test_tweets['message'].apply(lambda tweet: re.sub('[^a-z A-Z 0-9-]', '', tweet))

### Removal of digits

In [20]:
train_tweets['message'] = train_tweets['message'].apply(lambda tweet: re.sub('[\d]+', '', tweet))
test_tweets['message'] = test_tweets['message'].apply(lambda tweet: re.sub('[\d]+', '', tweet))
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length,stopword_count,hastag_count,mention_count
0,1,polyscimajor epa chief doesnt think carbon dio...,625221,19,122,6.421053,3,0,1
1,1,its not like we lack evidence of anthropogenic...,126103,10,53,5.3,3,0,0
2,2,rawstory researchers say we have three years ...,698562,19,122,6.421053,8,0,1
3,1,todayinmaker wired was a pivotal year in the...,573736,15,85,5.666667,5,1,0
4,1,soynoviodetodas its and a racist sexist clim...,466954,18,105,5.833333,5,1,1


### Remove multiple spaces

In [21]:
train_tweets['message'] = train_tweets['message'].apply(lambda tweet: ' '.join(tweet.split()))
test_tweets['message'] = test_tweets['message'].apply(lambda tweet: ' '.join(tweet.split()))

### Stop word removal

In [22]:
train_tweets['message'] = train_tweets['message'].apply(lambda tweet: ' '.join([word for word in tweet.split() if word not in STOP_WORDS]))
test_tweets['message'] = test_tweets['message'].apply(lambda tweet: ' '.join([word for word in tweet.split() if word not in STOP_WORDS]))
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length,stopword_count,hastag_count,mention_count
0,1,polyscimajor epa chief doesnt think carbon dio...,625221,19,122,6.421053,3,0,1
1,1,like lack evidence anthropogenic global warming,126103,10,53,5.3,3,0,0
2,2,rawstory researchers years act climate change ...,698562,19,122,6.421053,8,0,1
3,1,todayinmaker wired pivotal year war climate ch...,573736,15,85,5.666667,5,1,0
4,1,soynoviodetodas racist sexist climate change d...,466954,18,105,5.833333,5,1,1


### Lemmatize Tweets

In [23]:


def lemmatize_tweet(tweet):
    lemmas = []
    doc = nlp(tweet)
    document_tweets.append(doc)
    for token in doc:
        lemma = str(token.lemma_)
        if token.lemma_ == '-PRON-' or token.lemma_ == 'be':
            lemma = token.text
        lemmas.append(lemma)
        
    return ' '.join(lemmas)

train_tweets['message'] = train_tweets['message'].apply(lemmatize_tweet)
test_tweets['message'] = test_tweets['message'].apply(lemmatize_tweet)
train_tweets.head()

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length,stopword_count,hastag_count,mention_count
0,1,polyscimajor epa chief do not think carbon dio...,625221,19,122,6.421053,3,0,1
1,1,like lack evidence anthropogenic global warming,126103,10,53,5.3,3,0,0
2,2,rawstory researcher years act climate change late,698562,19,122,6.421053,8,0,1
3,1,todayinmaker wire pivotal year war climate change,573736,15,85,5.666667,5,1,0
4,1,soynoviodetodas racist sexist climate change d...,466954,18,105,5.833333,5,1,1


## Balancing the classes

### Separating majority and minority classes

In [24]:
news = train_tweets[train_tweets['sentiment'] == 2]
neutral = train_tweets[train_tweets['sentiment'] == 0]
positive = train_tweets[train_tweets['sentiment'] == 1]
negative = train_tweets[train_tweets['sentiment'] == -1]

### Resampling minority classes

In [25]:
# Upscaling neutral sentinent relatyed tweets
neutral_upscaled = resample(neutral,
                        replace=True,
                        n_samples=len(positive),
                        random_state=SEED
                        )
neutral_upscaled

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length,stopword_count,hastag_count,mention_count
5656,0,melanie jeniferstevens latimes nation world ad...,709291,20,119,5.950000,8,0,3
8570,0,graysondolan eventually bc global warming-,189990,7,45,6.428571,2,0,1
7453,0,eliapaul weather love fuck global warming,324580,11,54,4.909091,2,0,1
7201,0,jamesjcowan break netanyahu declare climate ch...,599175,18,123,6.833333,5,6,1
10956,0,evanlsoloman craig oliver explain climate chan...,454966,19,113,5.947368,3,0,1
...,...,...,...,...,...,...,...,...,...
10637,0,question u like smoking weed hate pay taxis th...,837396,26,121,4.653846,8,0,0
3464,0,hot debate paris agreement climate change,868545,16,104,6.500000,3,0,0
15718,0,scientist increase soda carbonation worldwide ...,894388,19,111,5.842105,7,1,0
3967,0,man crush continue climate change hoax it will...,340502,19,98,5.157895,8,0,0


In [26]:
# Upscaling negative sentinent related tweets
negative_upscaled = resample(negative,
                        replace=True,
                        n_samples=len(positive),
                        random_state=SEED
                        )
negative_upscaled

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length,stopword_count,hastag_count,mention_count
13780,-1,lmao fool think global warming realit degree o...,603068,14,74,5.285714,2,0,0
10438,-1,ipcc look climate change protester copenhagen ...,90618,16,98,6.125000,7,1,0
15806,-1,herbermp snessness al gore idiot think climate...,213863,23,108,4.695652,8,0,2
13854,-1,tuckercarlson realdonaldtrump wrestling meme r...,748586,12,78,6.500000,3,1,2
13434,-1,realdonaldtrump concept global warming create ...,502164,21,118,5.619048,9,0,1
...,...,...,...,...,...,...,...,...,...
12394,-1,columbiabugle al gores current house great mak...,443648,18,112,6.222222,4,0,1
4772,-1,gmbnumba obama could have spend mil spend clim...,629228,21,114,5.428571,5,0,1
10154,-1,world global warming alarmist nation ignore gr...,93930,10,86,8.600000,0,0,0
13047,-1,ecosensenow put pressure climate change politicos,882251,11,84,7.636364,3,0,1


In [27]:
# Upscaling news sentinent related tweets
news_upscaled = resample(news,
                        replace=True,
                        n_samples=len(positive),
                        random_state=SEED
                        )
news_upscaled

Unnamed: 0,sentiment,message,tweetid,word_count,char_count,average_word_length,stopword_count,hastag_count,mention_count
13776,2,climatehawk europe face drought flood storm cl...,849649,14,110,7.857143,1,1,2
15247,2,el nino warm planet spark zika epidemic scient...,610545,19,109,5.736842,5,0,0
3533,2,cbcales trump say keep open mind climate chang...,625516,21,120,5.714286,3,0,1
5570,2,thehill sierra club call investigation epa hea...,86301,16,122,7.625000,3,0,1
4824,2,tackle climate change boost economic growth oe...,291460,10,76,7.600000,1,0,0
...,...,...,...,...,...,...,...,...,...
8384,2,wsj california governor jerry brown challenge ...,680465,13,111,8.538462,1,0,1
6671,2,theecoheroe prince charles co - author ladybir...,328961,13,117,9.000000,0,2,1
14045,2,country set cash climate change,659392,10,59,5.900000,3,0,0
9194,2,officialjoelf miami underwater climate change,260626,13,86,6.615385,5,0,1


### Combine upsampled minority classes with majority class 

In [28]:
upsampled = pd.concat([neutral_upscaled, negative_upscaled, news_upscaled, positive])

# Check new class counts
downsampled['sentiment'].value_counts()

-1    8530
 2    8530
 1    8530
 0    8530
Name: sentiment, dtype: int64

## Model building

In [50]:
from sklearn.pipeline import Pipeline

In [63]:
train_features, test_features, train_labels, test_labels = train_test_split(upsampled['message'], upsampled['sentiment'], test_size=0.20, random_state=SEED)

### Logistic Regression Classifier

In [64]:
lg_classifier = Pipeline([('tfidf', TfidfVectorizer()),
                          ('clf', LogisticRegression(multi_class='multinomial', solver='sag', penalty='l2', random_state=SEED)),])

In [65]:
lg_classifier.fit(train_features, train_labels)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [66]:
# Form a prediction set
lg_predictions = lg_classifier.predict(test_features)

In [67]:
#predictions for  kaggle test data
lg_test_predictions = lg_classifier.predict(test_tweets['message'])


In [68]:
test_tweets['sentiment'] = lg_test_predictions
test_tweets.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


In [69]:
test_tweets.to_csv(path_or_buf='lg_predictions.csv', columns=['tweetid', 'sentiment'], index=False, index_label=['tweetid', 'sentiment'])

In [70]:
# Report the confusion matrix
print(confusion_matrix(test_labels,lg_predictions))

[[  81   32  150   15]
 [   9  154  225   37]
 [   9   50 1539  157]
 [   1    6  140  559]]


In [71]:
# Print a classification report
print(classification_report(test_labels,lg_predictions))

              precision    recall  f1-score   support

          -1       0.81      0.29      0.43       278
           0       0.64      0.36      0.46       425
           1       0.75      0.88      0.81      1755
           2       0.73      0.79      0.76       706

    accuracy                           0.74      3164
   macro avg       0.73      0.58      0.61      3164
weighted avg       0.73      0.74      0.72      3164



In [72]:
# Print the overall accuracy
print(accuracy_score(test_labels,lg_predictions))

0.7373577749683944


### Logistic Regression CV

In [None]:
# lgcv_classifier = Pipeline([('tfidf', TfidfVectorizer()),
#                           ('clf', LogisticRegressionCV(max_iter=1000,
#                                                      penalty='l1',
#                                                      class_weight='balanced',
#                                                      solver='saga',
#                                                      multi_class='multinomial',
#                                                      random_state=SEED)),])

In [82]:
#lgcv_classifier.fit(train_features, train_labels)

In [None]:
# Form a prediction set
lgcv_predictions = lgcv_classifier.predict(test_features)

In [None]:
# Report the confusion matrix
print(confusion_matrix(test_labels,lgcv_predictions))

In [None]:
# Print a classification report
print(classification_report(test_labels,lgcv_predictions))

In [None]:
# Print the overall accuracy
print(accuracy_score(test_labels,lgcv_predictions))

### Multinomial Naive Bayes

In [None]:
# Naïve Bayes:
mnb_classifier = Pipeline([('tfidf', TfidfVectorizer()), ('clf', MultinomialNB()),])

In [81]:
# train Naive bayes classifier
#mnb_classifier.fit(train_labels, train_labels)

In [None]:
# Form a prediction set
mnb_predictions = mnb_classifier.predict(test_features)

In [None]:
# Report the confusion matrix
print(metrics.confusion_matrix(test_labels,mnb_predictions))

In [None]:
# Print a classification report
print(metrics.classification_report(test_labels,mnb_predictions))

In [None]:
# Print the overall accuracy
print(metrics.accuracy_score(test_labels,mnb_predictions))

### SGD Classifier

### Linear SVC

In [73]:
# Linear SVC:
lsvc_classifier = Pipeline([('tfidf', TfidfVectorizer()),
                          
                          ('clf', LinearSVC(random_state=SEED)),])

In [74]:
lsvc_classifier.fit(train_features, train_labels)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [75]:
# Form a prediction set
lsvc_predictions = lsvc_classifier.predict(test_features)
lsvc_test_predictions = lsvc_classifier.predict(test_tweets['message'])

In [76]:
test_tweets.drop('sentiment', axis=1, inplace=True)
test_tweets.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [77]:
test_tweets['sentiment'] = lsvc_test_predictions
test_tweets.to_csv(path_or_buf='lsvc_predictions.csv', columns=['tweetid', 'sentiment'], index=False, index_label=['tweetid', 'sentiment'])
test_tweets.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


In [78]:
# Report the confusion matrix
print(confusion_matrix(test_labels, lsvc_predictions))

[[ 135   35   95   13]
 [  18  183  192   32]
 [  29   77 1496  153]
 [   7   10  118  571]]


In [79]:
# Print a classification report
print(classification_report(test_labels, lsvc_predictions))

              precision    recall  f1-score   support

          -1       0.71      0.49      0.58       278
           0       0.60      0.43      0.50       425
           1       0.79      0.85      0.82      1755
           2       0.74      0.81      0.77       706

    accuracy                           0.75      3164
   macro avg       0.71      0.64      0.67      3164
weighted avg       0.75      0.75      0.74      3164



In [80]:
# Print the overall accuracy
print(accuracy_score(test_labels, lsvc_predictions))

0.7537926675094817


### Decision Tree Classifier

### Random Forest Classifier

In [None]:
def get_polarity(n_grams):
    polarities = []
    #for n_gram in n_grams:
    return TextBlob(' '.join(n_grams)).sentiment.polarity
    #return sum(polarities) / len(polarities)

In [None]:
def get_sentivity(n_grams):
    sensitivities = []
    for n_gram in n_grams:
        sentivities.append(TextBlob(n_gram).sentiment.sensitivity())
    return sum(sentivities) / len(sentivities)