In [None]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import itertools, string, operator, re, unicodedata, nltk
from operator import itemgetter
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from bs4 import BeautifulSoup
import numpy as np
from itertools import combinations
from gensim.models import Phrases
from collections import Counter

# Contraction map: lowercase contraction -> expanded form.
# Keys must be lowercase because callers lowercase tokens before lookup.
c_dict = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "i'd": "I would",
  "i'd've": "I would have",
  "i'll": "I will",
  "i'll've": "I will have",
  "i'm": "I am",
  "i've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  # Fixed: these two previously expanded to "you you will ..." (duplicated pronoun).
  "you'll": "you will",
  "you'll've": "you will have",
  "you're": "you are",
  "you've": "you have"
}

# Regex alternation picks the first alternative that matches at a position, so
# longer contractions must come before their prefixes ("can't've" before
# "can't"); otherwise "can't've" would expand to "cannot've".
# Keys are escaped so they are always matched literally.
c_re = re.compile(
    '(%s)' % '|'.join(re.escape(key) for key in sorted(c_dict, key=len, reverse=True))
)

# Extra tokens added to the stop-word set: blanks, short fragments such as
# 'rt' and 's', ellipsis variants and stray non-ASCII characters.
add_stop = ['', ' ', 'say', 's', 'u', 'ap', 'afp', '....', 'n', '\\','rt',' ...', '... ','…','¢','â','¬','...','ã',',','¦']

# scikit-learn's English stop words plus the extras above.
# NOTE(review): no active code in the visible notebook filters with this set.
# NOTE(review): ENGLISH_STOP_WORDS is imported from
# sklearn.feature_extraction.stop_words at the top of the notebook; recent
# scikit-learn releases expose it from sklearn.feature_extraction.text
# instead - confirm against the installed version.
stop_words = ENGLISH_STOP_WORDS.union(add_stop)

# Shared tokenizer instance used by casual_tokenizer.
tokenizer = TweetTokenizer()
# NOTE(review): `pattern` is not referenced anywhere else in the visible notebook.
pattern = r"(?u)\b\w\w+\b" 

# Shared lemmatizer instance used by lemma_wordnet.
lemmatizer = WordNetLemmatizer()

# Unique punctuation characters; the list order is not guaranteed because the
# characters pass through a set.
punc = list(set(string.punctuation))

def casual_tokenizer(text):
    """Tokenize ``text`` with the module-level ``TweetTokenizer`` instance.

    Returns whatever ``tokenizer.tokenize`` produces for the given text.
    """
    return tokenizer.tokenize(text)

#Function to replace the nltk pos tags with the corresponding wordnet pos tag to use the wordnet lemmatizer
def get_word_net_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet POS constant.

    The decision is made on the first character of ``treebank_tag``:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty one) yields ``None``.
    """
    initial = treebank_tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'N':
        return wordnet.NOUN
    if initial == 'R':
        return wordnet.ADV
    return None
    
def lemma_wordnet(tagged_text):
    """Lemmatize ``(word, tag)`` pairs with the module-level lemmatizer.

    Each tag is mapped through ``get_word_net_pos``. When a WordNet POS is
    available it is passed as ``pos``; otherwise the lemmatizer is called
    with its default. Returns the lemmas as a list in input order.
    """
    def _lemmatize(token, treebank_tag):
        wn_pos = get_word_net_pos(treebank_tag)
        if wn_pos is not None:
            return lemmatizer.lemmatize(token, pos=wn_pos)
        return lemmatizer.lemmatize(token)

    return [_lemmatize(token, treebank_tag) for token, treebank_tag in tagged_text]

def expandContractions(text, c_re=c_re):
    """Replace every match of ``c_re`` in ``text`` with its ``c_dict`` expansion.

    The matched string is used verbatim as the ``c_dict`` key, so only
    spellings present in the map (lowercase) are expanded.
    """
    return c_re.sub(lambda hit: c_dict[hit.group(0)], text)

def remove_html(text):
    """Strip markup from ``text`` and tidy the remaining plain text.

    Steps, in order: extract text with BeautifulSoup (html5lib parser),
    apply NFKD Unicode normalization, blank out ``[...]`` spans, turn
    typographic apostrophes into ASCII ones, replace carriage returns and
    newlines with spaces, and finally collapse runs of spaces.
    """
    plain = BeautifulSoup(text, "html5lib").get_text()
    normalized = unicodedata.normalize("NFKD", plain)
    no_brackets = re.sub(r'\[.*?\]', '  ', normalized)
    straight_quotes = re.sub('’', "'", no_brackets)
    no_breaks = straight_quotes.replace('\r','  ').replace('\n','  ')
    return re.sub(' +',' ', no_breaks)

def process_text(text):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    Pipeline: strip markup (BeautifulSoup with the lxml parser plus a regex
    for leftover tags) -> tokenize with ``casual_tokenizer`` -> lowercase ->
    expand contractions -> POS-tag -> lemmatize -> strip digits.

    Differences from the naive per-token expansion:
      * An expanded contraction such as "does not" is split into separate
        tokens, so every word is tagged and lemmatized on its own.
      * Expansions are lowercased again because some map values contain a
        capital "I".
      * Tokens that become empty after digit removal (e.g. "2016") are
        dropped instead of being returned as ''.

    Punctuation and stop words are not filtered here.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML.

    Returns
    -------
    list of str
        Cleaned tokens in their original order.
    """
    soup = BeautifulSoup(text, "lxml")
    tags_del = soup.get_text()
    no_html = re.sub('<[^>]*>', '', tags_del)
    tokenized = casual_tokenizer(no_html)
    lower = [item.lower() for item in tokenized]
    # Expand each token, then split multi-word expansions into single words.
    decontract = [
        part.lower()
        for item in lower
        for part in expandContractions(item, c_re=c_re).split()
    ]
    tagged = nltk.pos_tag(decontract)
    lemma = lemma_wordnet(tagged)
    no_num = [re.sub('[0-9]+', '', each) for each in lemma]
    # Purely numeric tokens are now '' - drop them.
    return [token for token in no_num if token]

def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    ``text`` is converted with ``str()`` first, so non-string values are
    counted by their string form. Splitting on any run of whitespace means
    repeated spaces, tabs and newlines do not produce phantom empty words,
    and an empty string counts as 0 words.
    """
    return len(str(text).split())

def word_freq(clean_text_list, top_n):
    """
    Most frequent tokens across a collection of token lists.

    Flattens ``clean_text_list``, counts every token and returns the
    ``top_n`` most common ones as a two-column DataFrame
    (column 0: token, column 1: count).
    """
    tally = Counter(itertools.chain.from_iterable(clean_text_list))
    ranked = tally.most_common(top_n)
    tokens = [token for token, _ in ranked]
    counts = [count for _, count in ranked]
    return pd.DataFrame([tokens, counts]).T

def word_freq_bigrams(clean_text_list, top_n):
    """
    Most frequent tokens after phrase detection.

    Fits a gensim ``Phrases`` model (min_count=2, threshold=1) on
    ``clean_text_list``, applies it to the same documents, counts every
    resulting token (single words and joined phrases alike) and returns the
    ``top_n`` most common as a two-column DataFrame
    (column 0: token, column 1: count).
    """
    phrase_model = Phrases(clean_text_list, min_count=2, threshold=1)
    tally = Counter(
        token
        for document in phrase_model[clean_text_list]
        for token in document
    )
    ranked = tally.most_common(top_n)
    tokens = [token for token, _ in ranked]
    counts = [count for _, count in ranked]
    return pd.DataFrame([tokens, counts]).T


def bigram_freq(clean_text_list, top_n):
    """
    Most frequent detected phrases (bigrams) only.

    Fits a gensim ``Phrases`` model (min_count=2, threshold=1) on
    ``clean_text_list`` and applies it to the same documents. A resulting
    token is counted as a bigram when it contains '_' AND was not already a
    token in the input. The second condition keeps original tokens that
    happen to contain an underscore (for example handles such as
    "@world_wildlife") from being reported as phrases.

    ``clean_text_list`` must be re-iterable (e.g. a list of token lists)
    because it is traversed more than once.

    Returns a two-column DataFrame (column 0: phrase, column 1: count) with
    at most ``top_n`` rows.
    """
    bigram_model = Phrases(clean_text_list, min_count=2, threshold=1)
    # Vocabulary of the raw input, used to tell joined phrases from
    # pre-existing underscore tokens.
    original_vocab = {token for document in clean_text_list for token in document}
    counts = Counter(
        token
        for document in bigram_model[clean_text_list]
        for token in document
        if '_' in token and token not in original_vocab
    )
    top = counts.most_common(top_n)
    word = [each[0] for each in top]
    num = [each[1] for each in top]
    return pd.DataFrame([word, num]).T

In [None]:
import pandas as pd
import numpy as np
import json


'''Features'''
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

'''Metrics/Evaluation'''
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle

'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'darkgrid' style for plots.
sns.set_style('darkgrid')

'''Display'''
from IPython.core.display import display, HTML
# Inject CSS so the notebook container uses 95% of the available width.
display(HTML("<style>.container { width:95% !important; }</style>"))
import warnings
# NOTE(review): this silences every warning from here on, including
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings('ignore')
# Show floats in DataFrame output with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
# Fetch the NLTK 'averaged_perceptron_tagger' resource
# (presumably what nltk.pos_tag in process_text relies on - confirm).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Fetch the NLTK 'wordnet' resource
# (presumably required by the WordNetLemmatizer used in lemma_wordnet - confirm).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load the labelled training tweets and the test tweets from GitHub.
# NOTE(review): test_df is not referenced again in the visible part of the notebook.
df = pd.read_csv('https://raw.githubusercontent.com/jonnybegreat/test-repo/master/twitter_train.csv')
test_df = pd.read_csv('https://raw.githubusercontent.com/jonnybegreat/test-repo/master/twitter_test.csv')

In [None]:
# Preview the first rows of the training data.
df.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [None]:
#Apply process_text to every tweet: tokenize, lowercase, expand contractions, POS-tag, lemmatize and strip digits.
#process_text does not remove punctuation or stop words, so tokens such as ':' and 'rt' remain in clean_text.
df['clean_text'] = df['message'].apply(process_text)
df.head()

Unnamed: 0,sentiment,message,tweetid,clean_text
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,"[polyscimajor, epa, chief, does not, think, ca..."
1,1,It's not like we lack evidence of anthropogeni...,126103,"[it is, not, like, we, lack, evidence, of, ant..."
2,2,RT @RawStory: Researchers say we have three ye...,698562,"[rt, @rawstory, :, researcher, say, we, have, ..."
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,"[#todayinmaker, #, wire, :, , be, a, pivotal, ..."
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,"[rt, @soynoviodetodas, :, it is, , ,, and, a, ..."


In [None]:
#Top 20 most frequent tokens across all tweets

# One token list per tweet, as expected by word_freq.
cl_text_list = df['clean_text'].tolist()
wf = word_freq(cl_text_list, 20)
wf.head(20)

Unnamed: 0,0,1
0,change,12670
1,climate,12645
2,:,12575
3,rt,9720
4,.,8354
5,the,7766
6,be,7669
7,to,7186
8,",",6219
9,a,4934


In [None]:
#Avg word count by category

df['word_count'] = df['message'].apply(word_count)
# Average only the numeric columns: df also holds text columns ('message',
# 'clean_text'), and a plain .mean() over them raises TypeError on pandas >= 2.0.
avg_wc = df.groupby('sentiment').mean(numeric_only=True).reset_index()
avg_wc[['sentiment','word_count']]

Unnamed: 0,sentiment,word_count
0,-1,18.3
1,0,16.48
2,1,18.81
3,2,15.37


In [None]:
# Number of tweets per sentiment label.
df['sentiment'].value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [None]:
#Preparing the dataframes
# NOTE(review): this cell reassigns `df`, so it is not idempotent - running it
# again removes another 5 rows per class from `df`.

#Splitting the df into the four sentiment classes (-1, 0, 1, 2)
df_negative = df.loc[df['sentiment'] == -1] 
df_neutral = df.loc[df['sentiment'] == 0]
df_positive = df.loc[df['sentiment'] == 1] 
df_news = df.loc[df['sentiment'] ==2]

#Optional down-sampling of the larger classes (currently disabled)
#df_positive = df_positive.sample(n=3640, random_state=3)
#df_neutral = df_neutral.sample(n=1296, random_state=3)
#df_news = df_news.sample(n=1296, random_state=3)

#Holding out the first 5 tweets of each class for prediction at the end
df_negative_holdout = df_negative.iloc[:5]
df_neutral_holdout = df_neutral.iloc[:5]
df_positive_holdout = df_positive.iloc[:5]
df_news_holdout = df_news.iloc[:5]

#Everything after the first 5 rows stays in the modelling set
df_negative = df_negative.iloc[5:]
df_neutral = df_neutral.iloc[5:]
df_positive = df_positive.iloc[5:]
df_news = df_news.iloc[5:]

#Concatenating the per-class frames back together (rows are now grouped by class)
df = pd.concat([df_negative, df_neutral, df_positive, df_news])
df_holdout = pd.concat([df_negative_holdout, df_neutral_holdout, df_positive_holdout, df_news_holdout])

#Encoding the sentiment labels as consecutive integers (stored in 'label_num')
#Only df gets this column; df_holdout is left without it.
LE = LabelEncoder()
df['label_num'] = LE.fit_transform(df['sentiment'])

#Sanity checks: class sizes in both frames and the original/encoded label values
display(df.groupby(['sentiment'])['message'].count())
display(df_holdout.groupby(['sentiment'])['message'].count())
display(df['sentiment'].unique())
display(df['label_num'].unique())

sentiment
-1    1291
 0    2348
 1    8525
 2    3635
Name: message, dtype: int64

sentiment
-1    5
 0    5
 1    5
 2    5
Name: message, dtype: int64

array([-1,  0,  1,  2])

array([0, 1, 2, 3])

In [None]:
#Top 15 tokens per sentiment class, with detected bigrams counted as single tokens

top_n = 15

# Token lists for each class, in the order negative / neutral / positive / news.
text_neg, text_neut, text_pos, text_news = (
    frame['clean_text'].tolist()
    for frame in (df_negative, df_neutral, df_positive, df_news)
)

# One frequency table per class, same order as above.
neg, neut, pos, news = (
    word_freq_bigrams(tokens, top_n=top_n)
    for tokens in (text_neg, text_neut, text_pos, text_news)
)

# Side-by-side table: a (token, count) column pair for every class.
cols = ['negative', 'count', 'neutral', 'count', 'positive', 'count', 'news', 'count']
df_wf = pd.concat([neg, neut, pos, news], axis=1)
df_wf.columns = cols
df_wf

Unnamed: 0,negative,count,neutral,count.1,positive,count.2,news,count.3
0,.,513,rt,966,climate_change,4053,climate_change,1721
1,:,502,:,775,:,3559,:,1216
2,rt,429,climate_change,754,rt,3027,rt,987
3,climate_change,420,.,732,.,2443,change,702
4,global_warming,317,",",468,…,1729,',556
5,…,305,global_warming,457,be,1647,…,463
6,be,299,be,382,",",1604,",",381
7,",",278,…,350,the,1036,.,317
8,',241,the,303,a,983,a,307
9,the,239,a,274,to,978,on_climate,298


In [None]:
#Far fewer mentions of "climate change" in negative tweets
#Noise in the top tokens: RT, '...', random characters

In [None]:
#Top 15 detected bigrams per sentiment class

# One bigram table per class, in the order negative / neutral / positive / news.
neg_bigrams, neut_bigrams, pos_bigrams, news_bigrams = (
    bigram_freq(tokens, top_n = top_n)
    for tokens in (text_neg, text_neut, text_pos, text_news)
)

# Side-by-side table reusing the column labels defined for the word table.
df_bigram_wf = pd.concat(
    [neg_bigrams, neut_bigrams, pos_bigrams, news_bigrams],
    axis=1,
)
df_bigram_wf.columns = cols
df_bigram_wf

Unnamed: 0,negative,count,neutral,count.1,positive,count.2,news,count.3
0,climate_change,420,climate_change,754,climate_change,4053,climate_change,1721
1,global_warming,317,global_warming,457,¢_â,964,on_climate,298
2,:_the,89,‚_¬,144,‚_¬,956,global_warming,185
3,be_a,75,¢_â,143,â_¦,804,:_the,147
4,'_climate,67,global_warm,123,global_warming,738,¢_â,131
5,¢_â,59,â_¦,118,â_‚,704,‚_¬,131
6,‚_¬,58,on_climate,97,¬_â,678,’_s,130
7,change_',58,â_‚,92,in_climate,618,:_trump,125
8,â_¦,55,¬_â,83,change_.,595,of_climate,108
9,change_.,51,change_.,77,go_to,519,"change_,",104


In [None]:
#Observation: 'rt donald trump', 'retweet @stevesgoddard' and 'man make' are strongly associated with the negative class

In [None]:
#Preparing the model inputs and target.
#No tf-idf weights are computed here: the vectorizer call below is commented out,
#and vectorization is done by the TfidfVectorizer step of the Pipeline defined further down.

# Each token list is cast to its string representation (e.g. "['rt', ...]").
# NOTE(review): joining the tokens with spaces would give cleaner raw text - confirm before changing.
texts = df['clean_text'].astype('str')

#tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), 
#                                   min_df = 2, 
#                                   max_df = .95)

#X = tfidf_vectorizer.fit_transform(texts) #features
X = texts
y = df['label_num'].values #target

# Both shapes should report the same number of rows.
print (X.shape)
print(y.shape)

(15799,)
(15799,)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Hold out 20% of the rows for evaluation; random_state=0 makes the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=0)

In [None]:
# Candidate classifiers with default hyper-parameters.
# NOTE(review): only LinSVC is used by the pipeline in the visible notebook;
# LogReg and NB are not referenced again.
LogReg = LogisticRegression()
LinSVC = LinearSVC()
NB = MultinomialNB()

In [None]:
# Tf-idf features over 1- to 3-grams: terms must appear in at least 3
# documents (min_df) and in at most half of them (max_df).
tfidf_step = TfidfVectorizer(min_df=3, max_df=0.5, ngram_range=(1, 3))

# Two-stage pipeline: vectorize the text, then classify with the LinearSVC instance.
text_clf = Pipeline([
    ('tfidf', tfidf_step),
    ('clf', LinSVC),
])

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix,classification_report
# Train the tf-idf + LinearSVC pipeline on the training split, then predict the held-out split.
text_clf.fit(X_train,y_train)
predictions = text_clf.predict(X_test)
# Labels 0-3 are the LabelEncoder codes for sentiments -1, 0, 1 and 2.
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

# Micro-averaged F1 over all classes; shown as the cell's result.
metrics.f1_score(y_test,predictions,average='micro')

[[ 125   38   83   17]
 [  24  182  193   33]
 [  32  106 1467  132]
 [   7   24  132  565]]
              precision    recall  f1-score   support

           0       0.66      0.48      0.55       263
           1       0.52      0.42      0.47       432
           2       0.78      0.84      0.81      1737
           3       0.76      0.78      0.77       728

    accuracy                           0.74      3160
   macro avg       0.68      0.63      0.65      3160
weighted avg       0.73      0.74      0.73      3160



0.7401898734177215

In [None]:
# Sanity check: number of true labels in the test split.
y_test.shape

(3160,)

In [None]:
# Sanity check: number of texts in the test split.
X_test.shape

(3160,)

In [None]:
# Sanity check: number of predictions produced for the test split.
predictions.shape

(3160,)

In [None]:
# Put true label, predicted label and input text side by side for inspection.
# X_test is a pandas Series, so the frame takes its row index from it.
comparison_test_df = pd.DataFrame(
    {'Actual Value': y_test,
     'Predicted Value': predictions,
     'Tweet': X_test
    })

In [None]:
# Show the first rows of the comparison table without reassigning it.
# Indexing the frame with '' raises KeyError because its only columns are
# 'Actual Value', 'Predicted Value' and 'Tweet'.
# NOTE(review): if the intent was to filter rows (e.g. correct vs. incorrect
# predictions), replace this with an explicit boolean mask.
comparison_test_df.head(10)

Unnamed: 0,Actual Value,Predicted Value,Tweet
12232,2,2,"['rt', '@stephenschlegel', ':', 'she is', 'thi..."
10160,3,3,"['rt', '@justinkeeble', ':', 'china', 'warn', ..."
5869,1,1,"['rt', '@karoxxxx', ':', '🎵', 'it is', 'begin'..."
7512,2,2,"['hillary', '#clinton', 'position', 'on', 'cli..."
7498,2,2,"['rt', '@stephenschlegel', ':', 'she is', 'thi..."
9331,3,3,"['rt', '@coilltenews', ':', 'before', 'the', '..."
11985,3,3,"['rt', '@buzzfeednews', ':', 'some', 'republic..."
1907,2,2,"['@alberta', '@thestreet', 'that', 'company', ..."
4672,2,2,"['rt', '@washingtonpost', ':', 'opinion', ':',..."
1998,2,2,"['rt', '@world_wildlife', ':', 'weã', '¢', 'â'..."
