In [None]:
import nltk
import glob
import scipy
import numpy as np
import pandas as pd
from sklearn import svm, metrics
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier

<div class="alert alert-block alert-warning">
    1. Reading & PreProcessing Training Data
</div>

In [0]:
#ReadData              
Cols = ['ID','Label', 'Tweet']
data2013=pd.read_csv('twitter-2013train.txt',delimiter='\t', header=None,names=Cols)
data2015=pd.read_csv('twitter-2015train.txt',delimiter='\t', header=None,names=Cols)
data2016=pd.read_csv('twitter-2016train.txt',delimiter='\t', header=None,names=Cols)

Data = data2016.append(data2015).append(data2013)
#PreProcessing
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
stop = stopwords.words('english')
Data['Tweet'] = Data['Tweet'].replace({r'\\':'',r'\'':'',r'\,':'','&':'',r'\"':'','!':'','\.':'','u2019':'\'','u002c':',','(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)':''}, regex=True)
Data['Tweet'] = [ tweet.casefold() for tweet in Data['Tweet'] ]
Data['Tweet'] = Data['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

Nv = pd.read_csv('negative-words.txt', names=['Negative'], delimiter='\n', encoding='latin-1')
Pv = pd.read_csv('positive-words.txt', names=['Positve'], delimiter='\n', encoding='latin-1')

<div class="alert alert-block alert-warning">
    2. Feature Engineering
</div>

In [0]:
Data['Negative'] = Data['Tweet'].apply(lambda x: [x.count(word) for word in x.split() if any( word in s for s in Nv['Negative'])])
Data['Negative'] = Data['Negative'].apply(lambda x: x.count(1))
Data['Positve'] = Data['Tweet'].apply(lambda x: [x.count(word) for word in x.split() if any( word in s for s in Pv['Positve'])])
Data['Positve'] = Data['Positve'].apply(lambda x: x.count(1))
Data['Length']=[len(x.split()) for x in Data.Tweet]
Data['AvgNegative'] = Data['Negative']/Data['Length']
Data['AvgPositve'] = Data['Positve']/Data['Length']
Data['Porality'] = Data['Tweet'].apply(lambda tweet: TextBlob(tweet).sentiment[0])
Data['Subjectivity'] = Data['Tweet'].apply(lambda tweet: TextBlob(tweet).sentiment[1])
Data[:10]

Unnamed: 0,ID,Label,Tweet,Negative,Positve,Length,AvgNegative,AvgPositve,Porality,Subjectivity
0,628949369883000832,negative,dear newooffice mac great lync update cmon,2,3,7,0.285714,0.428571,0.8,0.75
1,628976607420645377,negative,make system doesnt eat friggin discs 2nd time ...,5,2,10,0.5,0.2,-0.357143,0.428571
2,629023169169518592,negative,may ignorant issue celebrate parental leave ch...,3,2,12,0.25,0.166667,0.0,0.0
3,629179223232479232,negative,thanks may switching,1,0,3,0.333333,0.0,0.2,0.2
4,629186282179153920,neutral,make game windows10 universal app xboxone owne...,4,3,11,0.363636,0.272727,0.033333,0.341667
5,629226490152914944,positive,microsoft may prefer gaming branch business ma...,4,2,12,0.333333,0.166667,0.416667,0.5
6,629345637155360768,negative,downgrading let windows10 almost 1st yr b4 try...,2,1,9,0.222222,0.111111,0.0,0.0
7,629394528336637953,negative,2nd computer error windows10fail guess shelve sp1,1,0,7,0.142857,0.0,0.0,0.0
8,629650766580609026,positive,ordered 1st ever tablet surface pro 3 i78gb 51...,4,5,15,0.266667,0.333333,0.0,0.0
9,629797991826722816,negative,attempting reinstall still bricks says windows...,4,1,13,0.307692,0.076923,0.0,0.0


<div class="alert alert-block alert-warning">
    3. Term Frequency
</div>

In [0]:
vectorizer = TfidfVectorizer(stop_words='english').fit(Data.Tweet)
train_vectorized = vectorizer.transform(Data.Tweet)
#toDense
dense = train_vectorized.todense()
toapp = np.append(np.matrix(Data['AvgNegative']).T,np.matrix(Data['AvgPositve']).T,axis=1)
toapp = np.append(toapp,np.matrix(Data['Porality']).T,axis=1)
toapp = np.append(toapp,np.matrix(Data['Subjectivity']).T,axis=1)
train_vectorizedUp=np.append(dense,toapp,axis=1)
#toSparse
train_vectorizedUp=scipy.sparse.csr_matrix(train_vectorizedUp)
print (train_vectorizedUp.shape)

(16041, 25966)


<div class="alert alert-block alert-warning">
    4. 10x Cross Validation using SVM
</div>

In [0]:
train_data = train_vectorizedUp
train_labels = Data["Label"]

Clfr = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(Clfr, train_data, train_labels, cv=10 )
fscores = cross_val_score(Clfr, train_data, train_labels, cv=10, scoring='f1_macro'  )
print (scores)
print ("Avg Accu: %0.3f (+/-%0.2f)" %(scores.mean(), scores.std() *2))
print (fscores)
print ("Avg F1: %0.3f (+/-%0.2f)" %(fscores.mean(), fscores.std() *2))

[0.44610592 0.35576324 0.23862928 0.49034268 0.61221945 0.64339152
 0.67394015 0.67373674 0.67623206 0.6338116 ]
Avg Accu: 0.544 (+/-0.29)
[0.39137264 0.32366716 0.24403991 0.45391259 0.55169271 0.5929479
 0.62890701 0.5993438  0.62246232 0.56006615]
Avg F1: 0.497 (+/-0.26)


<div class="alert alert-block alert-warning">
    5. 10x Cross Validation using Random Forest
</div>

In [0]:
Clf2 = RandomForestClassifier()

sc = ['accuracy', 'f1_macro']
Cv = cross_validate(Clf2, train_data, train_labels, cv=10, scoring=sc)

print("Accuracy: %0.10f (+/- %0.10f)" % (Cv['test_accuracy'].mean(), Cv.std()))
print ("Avg F1 Score: %0.3f (+/-%0.2f)" %(Cv['test_f1_macro'].mean(), Cv.std() *2))



Accuracy: 0.5427852006 (+/- 0.1345466631)
Avg F1 Score: 0.464 (+/-0.22)


<div class="alert alert-block alert-warning">
    6. 10x Cross Validation Bagging using Random Forest
</div>

In [0]:
model = BaggingClassifier(base_estimator=clf2, n_estimators=50, random_state=7)
Cv_Bagging = cross_val_score(model, train_data, train_labels, cv=10, scoring='accuracy')
print("Bagging using Random Forest")
print("Accuracy: %0.10f (+/- %0.10f)" % (scores.mean(), scores.std()))



Bagging using Random Forest
Accuracy: 0.5605537911 (+/- 0.1345466631)


<div class="alert alert-block alert-warning">
    7. PreProcessing Testing Data
</div>

In [0]:
#ReadData              
Test = pd.read_csv('test.csv', header=0)
#PreProcessing
Test['tweet'] = Test['tweet'].replace({r'\\':'',r'\'':'',r'\,':'','&':'',r'\"':'','!':'','\.':'','u2019':'\'','u002c':',','(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)':''}, regex=True)
Test['tweet'] = [ tweet.casefold() for tweet in Test['tweet'] ]
Test['tweet'] = Test['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

Test['Negative'] = Test['tweet'].apply(lambda x: [x.count(word) for word in x.split() if any( word in s for s in Nv['Negative'])])
Test['Negative'] = Test['Negative'].apply(lambda x: x.count(1))
Test['Positve'] = Test['tweet'].apply(lambda x: [x.count(word) for word in x.split() if any( word in s for s in Pv['Positve'])])
Test['Positve'] = Test['Positve'].apply(lambda x: x.count(1))
Test['Length']=[len(x.split()) for x in Test.tweet]
Test['AvgNegative'] = Test['Negative']/Test['Length']
Test['AvgPositve'] = Test['Positve']/Test['Length']
Test['Porality'] = Test['tweet'].apply(lambda tweet: TextBlob(tweet).sentiment[0])
Test['Subjectivity'] = Test['tweet'].apply(lambda tweet: TextBlob(tweet).sentiment[1])
Test[:10]

Unnamed: 0,id,tweet,Negative,Positve,Length,AvgNegative,AvgPositve,Porality,Subjectivity
0,218775148495515649,musical awareness great big beautiful tomorrow...,3,3,8,0.375,0.375,0.4125,0.4625
1,258965201766998017,radio786 1004fm 710 fri oct 19 labour analyst ...,4,2,15,0.266667,0.133333,0.0,0.0
2,262926411352903682,kapan sih lo ngebuktiinjan ngomong doang susah...,2,2,20,0.1,0.1,0.2,0.8
3,254948834910818305,tomorrow come hear debate navigating new highe...,3,4,13,0.230769,0.307692,0.193182,0.477273
4,171874368908050432,excuse connectivity live stream baba amr many ...,4,4,14,0.285714,0.285714,0.180682,0.3875
5,256010056942903296,show love local field amp might win award gall...,4,5,15,0.266667,0.333333,0.325,0.35
6,261776619146985472,tell update apple tv 3rd gen becomes available...,4,4,14,0.285714,0.285714,0.05,0.1125
7,264143999374356481,crown filthy mcnastys katy dalys duke york bel...,3,2,14,0.214286,0.142857,-0.8,1.0
8,223052929131757571,uncover eternal city return flights rome trave...,3,2,15,0.2,0.133333,0.0,0.0
9,264088575476391936,cre blog oklahoma per square foot returns blog...,3,3,14,0.214286,0.214286,0.25,0.25


<div class="alert alert-block alert-warning">
    8. Term Frequency
</div>

In [0]:
test_vectorized = vectorizer.transform(Test.tweet)
#toDense
densex = test_vectorized.todense()
toappx = np.append(np.matrix(Test['AvgNegative']).T,np.matrix(Test['AvgPositve']).T,axis=1)
toappx = np.append(toappx,np.matrix(Test['Porality']).T,axis=1)
toappx = np.append(toappx,np.matrix(Test['Subjectivity']).T,axis=1)
test_vectorizedUp = np.append(densex,toappx,axis=1)
#toSparse
test_vectorizedUp = scipy.sparse.csr_matrix(test_vectorizedUp)
print (test_vectorizedUp.shape)

(3096, 25966)


<div class="alert alert-block alert-warning">
    9. Testing
</div>

In [0]:
def cond(i):
    if     i == 'neutral': return 0
    elif   i == 'positive': return 1
    elif   i == 'negative': return 2
    return i
#Reading Test and Test Labels for evaluating results
Subm = pd.DataFrame([])
Subm['id'] = Test['id']
Subm['label'] = [cond(label) for label in predicted]
#print(test_labels[:20])
Subm

Subm.to_csv('v1.csv', index = False)

In [0]:
bagging = model.fit(train_data,train_labels)
predicted = bagging.predict(test_vectorizedUp)

answer = pd.read_csv('answer_key.csv',header=0)
print (metrics.classification_report(answer['label'],Subm['label']))

              precision    recall  f1-score   support

           0       0.60      0.78      0.68      1374
           1       0.68      0.69      0.69      1262
           2       0.90      0.06      0.11       460

    accuracy                           0.64      3096
   macro avg       0.73      0.51      0.49      3096
weighted avg       0.68      0.64      0.60      3096

