In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from xgboost import XGBClassifier

In [2]:
def analize_sentiment(tweet):
    
    analysis = TextBlob((str(tweet)))     #defining the function which will find the plority of a sentence
    return analysis.polarity 

In [3]:

news=pd.read_csv('/Users/karanpurswani/Documents/practicals/Stock-Market-prediction-Using-Daily-News-Headlines/Combined_News_DJIA.csv')

train_news = news[news['Date'] < '2014-07-15']   # SPLITTING THE DATASET INTO TRAINING AND TESTING
test_news = news[news['Date'] > '2014-07-14']

train_news_list = []
for row in range(0,len(train_news.index)): # CONVERT THE TRAINNG DATASET OF 27 COLUMNS INTO ONE ELEMENT IN THE LIST FOR EACH DAY
    train_news_list.append(' '.join(str(k) for k in train_news.iloc[row,2:27]))
    
vectorize= CountVectorizer(min_df=0.01, max_df=0.8) # DEFINING THE VECTOR FUNCTION, SPECIFYING THR MIN AND MAX WORD FREQUENCY FILTER
news_vector = vectorize.fit_transform(train_news_list) # TRANSFORMING THE TRAINING DATASET INTO WORD FREQUENCY TRANFORMATION
print( "THE TABLE OF FREQUENCY WORD DISTRIBUTION" , news_vector.shape)

THE TABLE OF FREQUENCY WORD DISTRIBUTION (1492, 4733)


In [4]:
lr=LogisticRegression()
model = lr.fit(news_vector, train_news["Label"])

test_news_list = []
for row in range(0,len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:27]))# CONVERT THE TESTING DATASET OF 27 COLUMNS INTO ONE ELEMENT IN THE LIST FOR EACH DAY

test_vector = vectorize.transform(test_news_list) # TRANSFORMING THE TESTING DATASET INTO WORD FREQUENCY TRANFORMATION

predictions = model.predict(test_vector)

pd.crosstab(test_news["Label"], predictions, rownames=["Actual"], colnames=["Predicted"])

accuracy1=accuracy_score(test_news['Label'], predictions)
print("the baseline model accuracy", accuracy1)

words = vectorize.get_feature_names()
coefficients = model.coef_.tolist()[0]
coeffdf = pd.DataFrame({'Word' : words,'Coefficient' : coefficients})  # WORD DISTRIBUTION OF THE MODEL

coeffdf = coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
print("Top ten words according to the baseline model",coeffdf.head(10))
print("Last ten words according to the baseline model",coeffdf.tail(10))



nvectorize = TfidfVectorizer(min_df=0.05, max_df=0.85,ngram_range=(2,2)) # DEFINING THE TFID TRANSFORMATION FUNCTION
news_nvector = nvectorize.fit_transform(train_news_list)

print(" TFID TRANSFOMATION DATAFRAME SHAPE",news_nvector.shape)

nmodel = lr.fit(news_nvector, train_news["Label"])



the baseline model accuracy 0.4607645875251509
Top ten words according to the baseline model           Word  Coefficient
3728      self     0.628725
4647      wing     0.533285
2090  hospital     0.533244
2392     kills     0.528196
4387      turn     0.518748
284      among     0.516599
762     cartel     0.514192
2929  olympics     0.508684
1146  damascus     0.508380
3585      rise     0.504063
Last ten words according to the baseline model            Word  Coefficient
3770        sex    -0.538371
1163         de    -0.542079
990       congo    -0.545630
4206     terror    -0.554574
4047   students    -0.562809
3653  sanctions    -0.570902
2100      hours    -0.571660
506       begin    -0.603627
4301      total    -0.610774
3626        run    -0.663518
 TFID TRANSFOMATION DATAFRAME SHAPE (1492, 284)




In [5]:
test_news_list = []
for row in range(0,len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:27])) # CONVERT THE TESTING DATASET OF 27 COLUMNS INTO ONE ELEMENT IN THE LIST FOR EACH DAY
ntest_vector = nvectorize.transform(test_news_list)
npredictions = nmodel.predict(ntest_vector)

pd.crosstab(test_news["Label"], npredictions, rownames=["Actual"], colnames=["Predicted"])

accuracy2=accuracy_score(test_news['Label'], npredictions)
print(" Logistics Regression with Bigram and TFID",accuracy2)

nwords = nvectorize.get_feature_names()
ncoefficients = nmodel.coef_.tolist()[0]
ncoeffdf = pd.DataFrame({'Word' : nwords, 
                        'Coefficient' : ncoefficients})
ncoeffdf = ncoeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
ncoeffdf.head(10)
ncoeffdf.tail(10)


nvectorize = TfidfVectorizer(min_df=0.01, max_df=0.95,ngram_range=(2,2))
news_nvector = nvectorize.fit_transform(train_news_list)

rfmodel = RandomForestClassifier(random_state=55)  #DEFINNG THE RANDOM FOREST MODEL
rfmodel = rfmodel.fit(news_nvector, train_news["Label"])
test_news_list = []
for row in range(0,len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:27]))
ntest_vector = nvectorize.transform(test_news_list)

rfpredictions = rfmodel.predict(ntest_vector)
accuracyrf = accuracy_score(test_news['Label'], rfpredictions)
print("Random forest with tfid and bigram", accuracyrf)

 Logistics Regression with Bigram and TFID 0.5311871227364185




Random forest with tfid and bigram 0.545271629778672


In [6]:


nvectorize = TfidfVectorizer(min_df=0.05, max_df=0.8,ngram_range=(2,2)) #DEFINING THE NAIVE BAYS MODEL
news_nvector = nvectorize.fit_transform(train_news_list)

nbmodel = MultinomialNB(alpha=0.5)
nbmodel = nbmodel.fit(news_nvector, train_news["Label"])

test_news_list = []
for row in range(0,len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:27])) # CONVERT THE TESTING DATASET OF 27 COLUMNS INTO ONE ELEMENT IN THE LIST FOR EACH DAY
ntest_vector = nvectorize.transform(test_news_list)

nbpredictions = nbmodel.predict(ntest_vector)
nbaccuracy=accuracy_score(test_news['Label'], nbpredictions)
print("Naive Bayes accuracy: ",nbaccuracy)

#author: Shravan Chintha
#Gradient Boosting Classifier

gbmodel = GradientBoostingClassifier(random_state=52)  # DEFINING THE GARDIANT BOOSTING MODEL
gbmodel = gbmodel.fit(news_nvector, train_news["Label"])
test_news_list = []
for row in range(0,len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:27]))
ntest_vector = nvectorize.transform(test_news_list)

gbpredictions = gbmodel.predict(ntest_vector.toarray())
gbaccuracy = accuracy_score(test_news['Label'], gbpredictions)

from sklearn.metrics import confusion_matrix
print(" CONFUSION MATRIX OF THE GRADIANT BOOSTING ", confusion_matrix(test_news['Label'], gbpredictions))


print("Gradient Boosting accuracy: ",gbaccuracy)



n3vectorize = TfidfVectorizer(min_df=0.0004, max_df=0.115,ngram_range=(3,3)) # DEFINING THE TFID , TRIGRAM MODEL
news_n3vector = n3vectorize.fit_transform(train_news_list)

print(news_n3vector.shape)

n3model = lr.fit(news_n3vector, train_news["Label"])

test_news_list = []
for row in range(0,len(test_news.index)):
    test_news_list.append(' '.join(str(x) for x in test_news.iloc[row,2:27])) # CONVERT THE TESTING DATASET OF 27 COLUMNS INTO ONE ELEMENT IN THE LIST FOR EACH DAY
n3test_vector = n3vectorize.transform(test_news_list)
n3predictions = n3model.predict(n3test_vector)

pd.crosstab(test_news["Label"], n3predictions, rownames=["Actual"], colnames=["Predicted"])

accuracy3=accuracy_score(test_news['Label'], n3predictions)
print("TRIGARAM ACCURACY", accuracy3)

n3words = n3vectorize.get_feature_names()
n3coefficients = n3model.coef_.tolist()[0]
n3coeffdf = pd.DataFrame({'Word' : n3words, 
                        'Coefficient' : n3coefficients})
n3coeffdf = n3coeffdf.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
print("trigram top ten word distibution", n3coeffdf.head(10))
print("trigram last ten word distibution", n3coeffdf.tail(10))    # trigram model word distribution 

Naive Bayes accuracy:  0.5291750503018109
 CONFUSION MATRIX OF THE GRADIANT BOOSTING  [[ 92 148]
 [ 74 183]]
Gradient Boosting accuracy:  0.5533199195171026
(1492, 569061)




TRIGARAM ACCURACY 0.5171026156941649
trigram top ten word distibution                       Word  Coefficient
509383           to the us     0.201466
481307        the right to     0.170945
322285   nobel peace prize     0.166078
223934  human rights watch     0.159698
491158         this is the     0.154684
240342        in west bank     0.151686
230935        in china the     0.139410
518984         turn out to     0.138018
239146     in the occupied     0.132465
321584         no fly zone     0.127306
trigram last ten word distibution                          Word  Coefficient
183898      freedom of speech    -0.141464
356524        osama bin laden    -0.141908
371338  phone hacking scandal    -0.147679
497344              to be the    -0.148085
207018      has been arrested    -0.151972
509742              to try to    -0.152776
334728        of human rights    -0.170488
416292             said to be    -0.191689
48303        around the world    -0.195347
238814         in the coun

In [7]:
train_sentiment=train_news
test_sentiment = test_news
train_sentiment =train_sentiment.drop(['Date', 'Label'], axis=1)
for column in train_sentiment:
    train_sentiment[column]=train_sentiment[column].apply(analize_sentiment)  #converting the train headlines into polarity scores
train_sentiment = train_sentiment+10  # removing negative co:efficient from the datset for better performance

test_sentiment =test_sentiment.drop(['Date', 'Label'], axis=1)
for column in test_sentiment:
    test_sentiment[column]=test_sentiment[column].apply(analize_sentiment) # converting the test headlines into ploarity 
test_sentiment=test_sentiment+10 # removing negative co:efficient from the datset for better performance

XGB_model= XGBClassifier()  # training the polarity score datset with DIJA 
gradiant=XGB_model.fit(train_sentiment, train_news['Label'])
y_pred= gradiant.predict(test_sentiment)


from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_news['Label'], y_pred))
from sklearn.metrics import accuracy_score
print("Sentiment Accuracy",accuracy_score(test_news['Label'], y_pred))
from sklearn.metrics import f1_score
print("f1_score__",f1_score(test_news['Label'], y_pred, average='weighted'))

######################END####################


[[ 75 165]
 [ 70 187]]
Sentiment Accuracy 0.5271629778672032
f1_score__ 0.5057056775644162
