In [1]:
import re
import pandas as pd
pd.set_option("display.max_colwidth", 200)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from wordcloud import WordCloud
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
get_ipython().magic('matplotlib inline')

In [2]:
train = pd.read_csv("train_file.csv")
train = train.drop(['IDLink', 'Source', 'PublishDate', 'Topic', 'Facebook', 'GooglePlus', 'LinkedIn'], axis=1)
train.head()

Unnamed: 0,Title,Headline,SentimentTitle,SentimentHeadline
0,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemetery. President Barack Obama has laid a wreath at the Tomb of the Unknowns to honor,0.0,-0.0533
1,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit head for fixed income at Gam, discusses the China beige book and the state of the economy.",0.208333,-0.156386
2,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at Roubini Global Economics, explains why the global economy isn't facing the same conditions",-0.42521,0.139754
3,Finland GDP Expands In Q4,"Finland's economy expanded marginally in the three months ended December, after contracting in the previous quarter, preliminary figures from Statistics Finland showed Monday.",0.0,0.026064
4,"Tourism, govt spending buoys Thai economy in January","Tourism and public spending continued to boost the economy in January, in light of contraction in private consumption and exports, according to the Bank of Thailand data.",0.0,0.141084


In [3]:
test = pd.read_csv("test_file.csv")
test = test.drop(['Source', 'PublishDate', 'Topic', 'Facebook', 'GooglePlus', 'LinkedIn'], axis=1)
test.head()

Unnamed: 0,IDLink,Title,Headline
0,tFrqIR6Chj,Sliding Economy: FG fights back with N3trn TSA funds,"With the 2016 budget now passed by the National Assembly and a N3trillion war chest, the government of President Muhammadu Buhari says"
1,DVAaGErjlF,Microsoft shows how HoloLens can bring distant family members ...,A recent Microsoft Research video shows how the $3000 augmented reality system can be used to transmit 3D models of people anywhere in
2,OT9UIZm5M2,"Microsoft’s Twitter Robot Praises Hitler, Trump & Recites Racism","* Microsoft teamed with Bing to create TayTweets, an account for a robot that was designed to learn about “conversational understanding,” by having automated discussions with Twitter users, and mi..."
3,lflGp3q2Fj,Flood of Central Bank Moves Can't Get World Economy Out of Rut,Central bankers have managed to steer the world economy clear of a recession while leaving it stuck in the same rut that led to its troubles in the first place.
4,zDYG0SoovZ,USD/JPY: bears lining up on mixed U.S. economy outlook,"However, this streak of seven-day gains might end here as markets take a step back and ponder in respect of the US economy and its inflation"


In [4]:
train.shape, test.shape

((55932, 4), (37288, 3))

In [5]:
from textblob import Word
stop_words = set(stopwords.words('english'))
result = []
title = pd.DataFrame()
for s in train['Title'].values:
    tokens = word_tokenize(s)
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stemmed = [Word(word).lemmatize() for word in words]
    result.append([i for i in stemmed if not i in stop_words])
title['result'] = result
title = pd.DataFrame(title)
title.head()

Unnamed: 0,result
0,"[obama, lay, wreath, arlington, national, cemetery]"
1,"[look, health, chinese, economy]"
2,"[nouriel, roubini, global, economy, back]"
3,"[finland, gdp, expands]"
4,"[tourism, govt, spending, buoy, thai, economy, january]"


In [6]:
stop_words = set(stopwords.words('english'))
result1 = []
headline = pd.DataFrame()
for s in train['Headline'].values:
    tokens = word_tokenize(s)
    tokens = [w.lower() for w in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    stemmed = [Word(word).lemmatize() for word in words]
    result1.append([i for i in stemmed if not i in stop_words])
headline['result'] = result1
headline = pd.DataFrame(headline)
headline.head()

Unnamed: 0,result
0,"[obama, lay, wreath, arlington, national, cemetery, president, barack, obama, ha, laid, wreath, tomb, unknown, honor]"
1,"[tim, haywood, investment, director, businessunit, head, fixed, income, gam, discus, china, beige, book, state, economy]"
2,"[nouriel, roubini, nyu, professor, chairman, roubini, global, economics, explains, global, economy, nt, facing, condition]"
3,"[finland, economy, expanded, marginally, three, month, ended, december, contracting, previous, quarter, preliminary, figure, statistic, finland, showed, monday]"
4,"[tourism, public, spending, continued, boost, economy, january, light, contraction, private, consumption, export, according, bank, thailand, data]"


In [7]:
for i in range(len(title['result'])):
    title['result'][i] = TreebankWordDetokenizer().detokenize(title['result'][i])
title['result'] = title
title.head()

Unnamed: 0,result
0,obama lay wreath arlington national cemetery
1,look health chinese economy
2,nouriel roubini global economy back
3,finland gdp expands
4,tourism govt spending buoy thai economy january


In [8]:
for i in range(len(headline['result'])):
    headline['result'][i] = TreebankWordDetokenizer().detokenize(headline['result'][i])
headline['result'] = headline
headline.head()

Unnamed: 0,result
0,obama lay wreath arlington national cemetery president barack obama ha laid wreath tomb unknown honor
1,tim haywood investment director businessunit head fixed income gam discus china beige book state economy
2,nouriel roubini nyu professor chairman roubini global economics explains global economy nt facing condition
3,finland economy expanded marginally three month ended december contracting previous quarter preliminary figure statistic finland showed monday
4,tourism public spending continued boost economy january light contraction private consumption export according bank thailand data


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer  
tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))  
X = tfidfconverter.fit_transform(title['result']).toarray()
X.shape

(55932, 2000)

In [10]:
X1 = tfidfconverter.fit_transform(headline['result']).toarray()
X1.shape

(55932, 2000)

In [12]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [13]:
train_tf = X[:55932,:]
test_tf = X[55932:,:]
# splitting data into training and validation set
xtrain_tf, xvalid_tf, ytrain, yvalid = train_test_split(train_tf, train['SentimentTitle'],
                                                          random_state=42,
                                                          test_size=0.2)

In [14]:
train_tf1 = X1[:55932,:]
test_tf1 = X1[55932:,:]
# splitting data into training and validation set
xtrain_tf1, xvalid_tf1, ytrain1, yvalid1 = train_test_split(train_tf1, train['SentimentHeadline'],
                                                          random_state=42,
                                                          test_size=0.2)

In [16]:
model = xgb.XGBRegressor()
xgb_model = model.fit(xtrain_tf, ytrain)
prediction = xgb_model.predict(xvalid_tf)

In [17]:
xgb_model1 = model.fit(xtrain_tf1, ytrain1)
prediction1 = xgb_model1.predict(xvalid_tf1)

In [18]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(yvalid, prediction))
print('Mean Squared Error:', metrics.mean_squared_error(yvalid, prediction))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(yvalid, prediction)))

Mean Absolute Error: 0.08584040887421067
Mean Squared Error: 0.014278428574043607
Root Mean Squared Error: 0.11949237872786535


In [19]:
print('Mean Absolute Error:', metrics.mean_absolute_error(yvalid1, prediction1))
print('Mean Squared Error:', metrics.mean_squared_error(yvalid1, prediction1))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(yvalid1, prediction1)))

Mean Absolute Error: 0.09840962721332887
Mean Squared Error: 0.016396861444472336
Root Mean Squared Error: 0.1280502301617312
