In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns
import re
%matplotlib inline
import spacy
import nltk
import string


In [2]:
# Load the 'en_core_web_md' spaCy pipeline.
# NOTE(review): `nlp` is not referenced by any cell visible here — confirm it is still needed.
nlp = spacy.load('en_core_web_md')

In [3]:
# Read both data sets from the working directory. Later cells read 'message'
# and 'tweetid' from these frames, plus 'sentiment' from `train`.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [5]:
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def removing_stopwords(post, ignore_case=False):
    """Drop English stop words from ``post`` and return the remaining tokens.

    Parameters
    ----------
    post : str
        Text to filter; it is split into tokens with ``word_tokenize``.
    ignore_case : bool, default False
        When False (the original behaviour) each token is compared verbatim
        against the stop-word set, so a capitalised stop word such as 'This'
        is kept, as the sample call in this notebook shows. Pass True to
        compare ``token.lower()`` instead and remove those as well.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # The stop-word set is cached: this function runs once per tweet under
    # ``Series.apply`` and the corpus does not change between calls.
    stop_words = _english_stop_words()
    kept = [
        token
        for token in word_tokenize(post)
        if (token.lower() if ignore_case else token) not in stop_words
    ]
    return " ".join(kept)

In [6]:
def remove_punctuation_numbers(post):
    """Return ``post`` with every ASCII punctuation mark and digit removed.

    All other characters, including whitespace, are kept in their original
    order.
    """
    unwanted = string.punctuation + '0123456789'
    kept_chars = filter(lambda ch: ch not in unwanted, post)
    return ''.join(kept_chars)

In [7]:
def remove_urls(data):
    """Replace every http(s) URL in ``data['message']`` with the token 'url-web'.

    The 'message' column is overwritten on the DataFrame that was passed in,
    and that same DataFrame is returned.
    """
    url_regex = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
    placeholder = r'url-web'
    messages = data['message']
    data['message'] = messages.replace(to_replace=url_regex, value=placeholder, regex=True)
    return data

In [8]:
def compound_column(data):
    """Add a 'compound' column holding the VADER compound score of each message.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'message' column of text; each value is passed to
        ``SentimentIntensityAnalyzer.polarity_scores``.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the new 'compound' column added in place.
    """
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    sid = SentimentIntensityAnalyzer()
    # Pull the 'compound' entry straight out of each polarity dict so that no
    # temporary 'scores' column is ever written onto the caller's DataFrame.
    data['compound'] = data['message'].apply(
        lambda message: sid.polarity_scores(message)['compound']
    )
    return data

In [9]:
def length_of_message(data):
    """Store the character count of each message in a new 'length' column.

    The column is added to the DataFrame that was passed in, and that same
    DataFrame is returned.
    """
    message_lengths = data['message'].apply(len)
    data['length'] = message_lengths
    return data

In [10]:
def metric_evaluation(y_test, predictions):
    """Print the confusion matrix, classification report and accuracy.

    ``y_test`` holds the true labels and ``predictions`` the predicted ones.
    Nothing is returned; the three results are printed in that order.
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    # Compute all three before printing anything, then emit them in order.
    reports = (
        confusion_matrix(y_test, predictions),
        classification_report(y_test, predictions),
        accuracy_score(y_test, predictions),
    )
    for report in reports:
        print(report)

In [11]:
from sklearn.model_selection import train_test_split

In [32]:
# Clean a copy of the training frame so `train` keeps the raw tweets.
# URLs are replaced first: once ':' and '/' are stripped the URL pattern can no
# longer match. The 'url-web' placeholder itself loses its hyphen in that step.
a = remove_urls(train.copy())
a['message'] = (
    a['message']
    .apply(remove_punctuation_numbers)
    .apply(removing_stopwords)
)


In [33]:
# Preview the first rows of the cleaned training frame.
a.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesnt think carbon dio...,625221
1,1,Its like lack evidence anthropogenic global wa...,126103
2,2,RT RawStory Researchers say three years act cl...,698562
3,1,TodayinMaker WIRED pivotal year war climate ch...,573736
4,1,RT SoyNovioDeTodas Its racist sexist climate c...,466954


In [30]:
# Sanity check on a toy sentence. The capitalised 'This' survives because
# tokens are compared to the stop-word set verbatim (case-sensitive).
removing_stopwords('This is a test to remove the stop words')

'This test remove stop words'

In [24]:
# Show the raw tokens word_tokenize yields for the same sentence, before any filtering.
po = 'This is a test to remove the stop words'
word_tokenize(po)

['This', 'is', 'a', 'test', 'to', 'remove', 'the', 'stop', 'words']

In [34]:
# Model input is the cleaned tweet text; the target is its sentiment label.
X, y = a['message'], a['sentiment']

In [35]:
# Hold out 20% of the cleaned tweets for validation; the fixed seed keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Inspect the training portion of the cleaned messages.
X_train

6735     AlJazeera English HDLiveStreamClimate SOS Inno...
13537    RT kylegriffin Tillerson moving eliminate leas...
6876     RT nowthisnews Show terrifyingly alarming phot...
10035    RT tpoliticalnews A majority Republicans House...
5174     ClimateNPS MAN STUPID Ã¢â‚¬â€ A powerful musi...
5001     Carbon dioxide biggest contributor global warm...
6887     RT jwalkenrdc Scott Pruitt ’ office deluged an...
4747     With crisis climate change comes incredible op...
9473     RT juiceDiem Before I go bed If think flag bur...
2133     arnoldcam VanJones reform combat climate chang...
10565    RT SenSanders We presidentelect doesnÃ¢â‚¬â „ ...
11723    Depression anxiety PTSD The mental impact clim...
1626     It took degree temperature November ocean reef...
5653     Pope urges world leaders hobble climate change...
11444    RT wef This major Canadian river dried four da...
1237     RT PaulHindley Sorry Paul Nuttall working clas...
7308     Well stop climate change Trump proposes steep .

In [50]:
# Count vectorization: learn the vocabulary from the training messages and
# turn each message into a sparse row of token counts.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

In [51]:
# Show the shape and sparsity of the count matrix.
X_train_counts

<12655x20925 sparse matrix of type '<class 'numpy.int64'>'
	with 156867 stored elements in Compressed Sparse Row format>

In [52]:
# Re-weight the raw token counts into TF-IDF features.
# NOTE(review): X_train_tfidf is assigned again in the following cell via
# TfidfVectorizer, so this result is overwritten there.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [36]:
# Build TF-IDF features directly from the raw training text in one step.
# This reassigns X_train_tfidf, replacing the value from the previous cell.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

In [54]:
# Confirm the TF-IDF matrix dimensions (rows = training messages, columns = vocabulary terms).
X_train_tfidf.shape

(12655, 20925)

In [37]:
# Fit a linear SVM directly on the TF-IDF matrix.
# NOTE(review): the predictions further down come from `text_clf`, not from
# this `clf` — confirm this standalone fit is still needed.
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)

LinearSVC()

In [38]:
# Chain vectorization and classification so raw text can be passed straight
# to fit/predict; the pipeline is fitted on the raw training messages.
# NOTE(review): LinearSVC() is built without a random_state — consider pinning one for repeatable fits.
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC())])
text_clf.fit(X_train,y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [39]:
# Predict labels for the held-out validation messages with the fitted pipeline.
predictions = text_clf.predict(X_test)

In [40]:
# Print the confusion matrix, per-class report and accuracy for the validation split.
metric_evaluation(y_test,predictions)

[[ 130   36  100   12]
 [  22  185  184   34]
 [  26   86 1488  155]
 [   4   10  129  563]]
              precision    recall  f1-score   support

          -1       0.71      0.47      0.57       278
           0       0.58      0.44      0.50       425
           1       0.78      0.85      0.81      1755
           2       0.74      0.80      0.77       706

    accuracy                           0.75      3164
   macro avg       0.70      0.64      0.66      3164
weighted avg       0.74      0.75      0.74      3164

0.7477876106194691


# unseen

In [41]:
# Apply the same cleaning steps used for the training data to a copy of the
# test frame, so the pipeline sees text in the form it was trained on.
unseen = remove_urls(test.copy())
unseen['message'] = (
    unseen['message']
    .apply(remove_punctuation_numbers)
    .apply(removing_stopwords)
)

In [42]:
# Predict a sentiment label for every cleaned test message.
predict_unseen = text_clf.predict(unseen['message'])

In [43]:
# Start an empty frame; the next cell fills in the 'tweetid' and 'sentiment' columns.
results = pd.DataFrame()

In [44]:
# Pair each test tweet id with its predicted label, then renumber the rows from 0.
results = results.assign(
    tweetid=test['tweetid'],
    sentiment=predict_unseen,
).reset_index(drop=True)

In [45]:
# Write the predictions to 'resultsLinear.csv' without the index column.
results.to_csv(r"resultsLinear.csv",sep=',',index=False)
# Number of rows written, shown as the cell output.
len(results)

10546

In [66]:
# Find duplicated messages in the raw training data. `duplicated()` marks every
# repeat after the first occurrence, so first occurrences are not listed.
# NOTE(review): identical tweets may fall on both sides of the train/validation
# split above, which could inflate the validation score — confirm and consider deduplicating.
b = train['message']
duplicateDFRow = b[b.duplicated()]
print(duplicateDFRow)

51       RT @StephenSchlegel: she's thinking about how ...
70       RT @kelkulus: Irony: Florida, a state in dange...
98       RT @StephenSchlegel: she's thinking about how ...
112      RT @SenSanders: We have a president-elect who ...
122      RT @StephenSchlegel: she's thinking about how ...
123      RT @SethMacFarlane: HRC proposes installing ha...
142      RT @StephenSchlegel: she's thinking about how ...
184      RT @StephenSchlegel: she's thinking about how ...
224      RT @StephenSchlegel: she's thinking about how ...
240      RT @StephenSchlegel: she's thinking about how ...
246      RT @SenSanders: We have a president-elect who ...
257      RT @BernieSanders: #ImVotingBecause the future...
269      RT @CounterMoonbat: The people who predicted p...
317      RT @SenSanders: We have a president-elect who ...
346      RT @NatGeoChannel: Watch #BeforeTheFlood right...
363      RT @SenSanders: We have a president-elect who ...
392      RT @NatGeoChannel: Watch #BeforeTheFlood right.