In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,log_loss
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from xgboost import XGBClassifier
from warnings import simplefilter

In [2]:
def analize_sentiment(tweet):    
    analysis = TextBlob((str(tweet)))     #defining the function which will find the plority of a sentence
    return analysis.polarity

In [3]:
simplefilter(action='ignore', category=FutureWarning)

In [4]:
df= pd.read_csv('Train_Data.csv')
# SPLITTING THE DATASET INTO TRAINING AND TESTING
train = df[df['Date'] < '2011-05-09']
test = df[df['Date'] > '2011-05-18']

# CONVERT THE TRAINNG DATASET OF 27 COLUMNS INTO ONE ELEMENT IN THE LIST FOR EACH DAY
train_list = []
for row in range(0,len(train.index)): 
    train_list.append(' '.join(str(k) for k in train.iloc[row,2:27]))
    
# DEFINING THE VECTOR FUNCTION, SPECIFYING THR MIN AND MAX WORD FREQUENCY FILTER  
vectorize= CountVectorizer(min_df=0.01, max_df=0.8)

# TRANSFORMING THE TRAINING DATASET INTO WORD FREQUENCY TRANFORMATION
train_vector = vectorize.fit_transform(train_list) 
print( "THE TABLE OF FREQUENCY WORD DISTRIBUTION" , train_vector.shape)

THE TABLE OF FREQUENCY WORD DISTRIBUTION (692, 4514)


In [5]:
lr=LogisticRegression()

model = lr.fit(train_vector, train["Label"])

test_list = []

# CONVERT THE TESTING DATASET OF 27 COLUMNS INTO ONE ELEMENT IN THE LIST FOR EACH DAY
for row in range(0,len(test.index)):
    test_list.append(' '.join(str(x) for x in test.iloc[row,2:27]))
  
# TRANSFORMING THE TESTING DATASET INTO WORD FREQUENCY TRANFORMATION
test_vector = vectorize.transform(test_list) 

predictions = model.predict(test_vector)

pd.crosstab(test["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

accuracy1=accuracy_score(test['Label'], predictions)

print("the baseline model accuracy", accuracy1)

# WORD DISTRIBUTION OF THE MODEL
words = vectorize.get_feature_names()
coefficients = model.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : words,'Coefficient' : coefficients})  

coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
print("Top ten words according to the baseline model",coeffdf.head(10))
print("Last ten words according to the baseline model",coeffdf.tail(10))


# DEFINING THE TFID TRANSFORMATION FUNCTION
nvectorize = TfidfVectorizer(min_df=0.05, max_df=0.85,ngram_range=(2,2)) 
news_nvector = nvectorize.fit_transform(train_list)

print(" TFID TRANSFOMATION DATAFRAME SHAPE",news_nvector.shape)

nmodel = lr.fit(news_nvector, train["Label"])

the baseline model accuracy 0.5144508670520231
Top ten words according to the baseline model             Word  Coefficient
3031   political     0.563931
400   australian     0.494684
1690      forced     0.462982
2396        life     0.452875
2218      jewish     0.436600
4341      wanted     0.434160
1985       homes     0.426856
1545       faces     0.425054
3708          so     0.422071
3805       state     0.414759
Last ten words according to the baseline model           Word  Coefficient
4274  vehicles    -0.373066
2197     italy    -0.391548
3544    search    -0.391845
65          40    -0.393927
2015       how    -0.442358
517        big    -0.472416
692     canada    -0.489905
2670        my    -0.491770
2483      many    -0.493135
2698      need    -0.513944
 TFID TRANSFOMATION DATAFRAME SHAPE (692, 266)


In [6]:
test_list = []
# CONVERT THE TESTING DATASET OF 27 COLUMNS INTO ONE ELEMENT IN THE LIST FOR EACH DAY
for row in range(0,len(test.index)):
    test_list.append(' '.join(str(x) for x in test.iloc[row,2:27]))
 
ntest_vector = nvectorize.transform(test_list)
npredictions = nmodel.predict(ntest_vector)

pd.crosstab(test["Label"], npredictions, rownames=["Actual"], colnames=["Predicted"])

accuracy2=accuracy_score(test['Label'], npredictions)
print("Logistics Regression with Bigram and TFID",accuracy2)

nwords = nvectorize.get_feature_names()
ncoefficients = nmodel.coef_.tolist()[0]
ncoeffdf = pd.DataFrame({'Word' : nwords,'Coefficient' : ncoefficients})
ncoeffdf = ncoeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
ncoeffdf.head(10)
ncoeffdf.tail(10)


nvectorize = TfidfVectorizer(min_df=0.01, max_df=0.95,ngram_range=(2,2))
news_nvector = nvectorize.fit_transform(train_list)

#DEFINNG THE RANDOM FOREST MODEL
rfmodel = RandomForestClassifier(random_state=55)  
rfmodel = rfmodel.fit(news_nvector, train["Label"])
test_list = []
for row in range(0,len(test.index)):
    test_list.append(' '.join(str(x) for x in test.iloc[row,2:27]))
ntest_vector = nvectorize.transform(test_list)

rfpredictions = rfmodel.predict(ntest_vector)
accuracyrf = accuracy_score(test['Label'], rfpredictions)
print("Random forest with tfid and bigram", accuracyrf)

Logistics Regression with Bigram and TFID 0.5361271676300579
Random forest with tfid and bigram 0.5


In [7]:
# DEFINING THE NAIVE BAYS MODEL
nvectorize = TfidfVectorizer(min_df=0.05, max_df=0.8,ngram_range=(2,2))
news_nvector = nvectorize.fit_transform(train_list)

nbmodel = MultinomialNB(alpha=0.5)
nbmodel = nbmodel.fit(news_nvector, train["Label"])

# CONVERT THE TESTING DATASET OF 27 COLUMNS INTO ONE ELEMENT IN THE LIST FOR EACH DAY
test_list = []
for row in range(0,len(test.index)):
    test_list.append(' '.join(str(x) for x in test.iloc[row,2:27]))
    
ntest_vector = nvectorize.transform(test_list)

nbpredictions = nbmodel.predict(ntest_vector)
nbaccuracy=accuracy_score(test['Label'], nbpredictions)

print("Naive Bayes accuracy: ",nbaccuracy)

#author: Shravan Chintha
#Gradient Boosting Classifier
# DEFINING THE GARDIANT BOOSTING MODEL
gbmodel = GradientBoostingClassifier(random_state=52) 
gbmodel = gbmodel.fit(news_nvector, train["Label"])
test_list = []
for row in range(0,len(test.index)):
    test_list.append(' '.join(str(x) for x in test.iloc[row,2:27]))
ntest_vector = nvectorize.transform(test_list)

gbpredictions = gbmodel.predict(ntest_vector.toarray())
gbaccuracy = accuracy_score(test['Label'], gbpredictions)

from sklearn.metrics import log_loss
logloss = log_loss(test['Label'], gbpredictions, eps=1e-15)
print("gbpredictions" ,logloss)

from sklearn.metrics import confusion_matrix
print(" CONFUSION MATRIX OF THE GRADIANT BOOSTING ", confusion_matrix(test['Label'], gbpredictions))

print("Gradient Boosting accuracy: ",gbaccuracy)


# DEFINING THE TFID , TRIGRAM MODEL
n3vectorize = TfidfVectorizer(min_df=0.0004, max_df=0.115,ngram_range=(3,3)) 
news_n3vector = n3vectorize.fit_transform(train_list)

print(news_n3vector.shape)

n3model = lr.fit(news_n3vector, train["Label"])

# CONVERT THE TESTING DATASET OF 27 COLUMNS INTO ONE ELEMENT IN THE LIST FOR EACH DAY
test_list = []
for row in range(0,len(test.index)):
    test_list.append(' '.join(str(x) for x in test.iloc[row,2:27])) 
    
n3test_vector = n3vectorize.transform(test_list)
n3predictions = n3model.predict(n3test_vector)

pd.crosstab(test["Label"], n3predictions, rownames=["Actual"], colnames=["Predicted"])

accuracy3=accuracy_score(test['Label'], n3predictions)

print("TRIGARAM ACCURACY", accuracy3)

from sklearn.metrics import log_loss
logloss = log_loss(test['Label'], n3predictions, eps=1e-15)
print(logloss)

 # trigram model word distribution
n3words = n3vectorize.get_feature_names()
n3coefficients = n3model.coef_.tolist()[0]
n3coeffdf = pd.DataFrame({'Word' : n3words,'Coefficient' : n3coefficients})
n3coeffdf = n3coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
print("trigram top ten word distibution", n3coeffdf.head(10))
print("trigram last ten word distibution", n3coeffdf.tail(10))  

Naive Bayes accuracy:  0.5433526011560693
gbpredictions 17.0700274965675
 CONFUSION MATRIX OF THE GRADIANT BOOSTING  [[ 68 247]
 [ 95 282]]
Gradient Boosting accuracy:  0.5057803468208093
(692, 251230)
TRIGARAM ACCURACY 0.5447976878612717
15.722494852004878
trigram top ten word distibution                           Word  Coefficient
244050             will not be     0.132308
101163             in favor of     0.121931
112457          israel and the     0.117012
97394       human rights watch     0.116544
141374       nobel peace prize     0.104012
139891          new york times     0.103986
78739   founder julian assange     0.100721
24006              at least 10     0.097888
149833             of the most     0.097688
89831           has been found     0.097416
trigram last ten word distibution                     Word  Coefficient
205544   the conflict in    -0.119483
219430       to death in    -0.128549
102930       in order to    -0.132749
222023        to pay for    -0.136040
2

In [8]:
train_sentiment=train
test_sentiment = test
train_sentiment =train_sentiment.drop(['Date', 'Label','key'], axis=1)
for column in train_sentiment:
    #converting the train headlines into polarity scores
    train_sentiment[column]=train_sentiment[column].apply(analize_sentiment)  
    # removing negative co:efficient from the datset for better performance
    train_sentiment = train_sentiment


test_sentiment =test_sentiment.drop(['Date', 'Label','key'], axis=1)
for column in test_sentiment:
    test_sentiment[column]=test_sentiment[column].apply(analize_sentiment) 
    test_sentiment=test_sentiment 
# training the polarity score datset with DIJA 
XGB_model= XGBClassifier()  
gradiant=XGB_model.fit(train_sentiment, train['Label'])
y_pred= gradiant.predict(test_sentiment)


from sklearn.metrics import confusion_matrix
print(confusion_matrix(test['Label'], y_pred))
from sklearn.metrics import accuracy_score
print("Sentiment Accuracy",accuracy_score(test['Label'], y_pred))
from sklearn.metrics import f1_score
print("f1_score__",f1_score(test['Label'], y_pred, average='weighted'))
from sklearn.metrics import log_loss
logloss = log_loss(test['Label'], y_pred, eps=1e-15)
print(logloss)


[[ 96 219]
 [109 268]]
Sentiment Accuracy 0.5260115606936416
f1_score__ 0.506051043262026
16.371233770762885
