In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_name) for csv_name in ("train_file.csv", "test_file.csv")
)

In [3]:
# Preview the first five rows of the training data.
train.head()

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
0,Tr3CMgRv1N,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,USA TODAY,obama,2002-04-02 00:00:00,-1,-1,-1,0.0,-0.0533
1,Wc81vGp8qZ,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",Bloomberg,economy,2008-09-20 00:00:00,-1,-1,-1,0.208333,-0.156386
2,zNGH03CrZH,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Bloomberg,economy,2012-01-28 00:00:00,-1,-1,-1,-0.42521,0.139754
3,3sM1H0W8ts,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,RTT News,economy,2015-03-01 00:06:00,-1,-1,-1,0.0,0.026064
4,wUbnxgvqaZ,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,The Nation - Thailand&#39;s English news,economy,2015-03-01 00:11:00,-1,-1,-1,0.0,0.141084


In [4]:
# List the column names of the training data.
train.columns

Index(['IDLink', 'Title', 'Headline', 'Source', 'Topic', 'PublishDate',
       'Facebook', 'GooglePlus', 'LinkedIn', 'SentimentTitle',
       'SentimentHeadline'],
      dtype='object')

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
train.describe()

Unnamed: 0,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
count,55932.0,55932.0,55932.0,55932.0,55932.0
mean,132.050329,4.551616,14.300132,-0.006318,-0.029577
std,722.931314,21.137177,76.65142,0.137569,0.143038
min,-1.0,-1.0,-1.0,-0.838525,-0.755355
25%,0.0,0.0,0.0,-0.079057,-0.116927
50%,6.0,0.0,0.0,0.0,-0.027277
75%,37.0,2.0,4.0,0.063969,0.057354
max,49211.0,1267.0,3716.0,0.962354,0.964646


In [6]:
# Count missing values per column in the training data.
train.isna().sum()

IDLink                 0
Title                  0
Headline               0
Source               175
Topic                  0
PublishDate            0
Facebook               0
GooglePlus             0
LinkedIn               0
SentimentTitle         0
SentimentHeadline      0
dtype: int64

In [7]:
# Count missing values per column in the test data.
test.isna().sum()

IDLink           0
Title            0
Headline         0
Source         101
Topic            0
PublishDate      0
Facebook         0
GooglePlus       0
LinkedIn         0
dtype: int64

In [8]:
# List the distinct Source values in the training data.
train["Source"].unique()

array(['USA TODAY', 'Bloomberg', 'RTT News', ..., 'KERA News',
       'KEVN Black Hills Fox', 'SHRM'], dtype=object)

In [9]:
# Count how often each source appears in the training data. Bloomberg is the most
# frequent source, so it is used further down to fill the missing Source values.
train["Source"].value_counts()

Bloomberg                     992
Reuters                       763
ABC News                      645
New York Times                573
The Guardian                  551
                             ... 
Catholic New York               1
Greater Greater Washington      1
Dorset Echo                     1
The Wayne Independent           1
Animation World Network         1
Name: Source, Length: 4753, dtype: int64

In [10]:
# Count training rows per Topic.
train["Topic"].value_counts()

economy      20486
obama        16917
microsoft    12911
palestine     5618
Name: Topic, dtype: int64

In [11]:
# Count how often each source appears in the test data.
test["Source"].value_counts()

Bloomberg                     740
Reuters                       558
ABC News                      453
New York Times                419
MSPoweruser.com               416
                             ... 
New Europe                      1
The Australian (blog)           1
National Catholic Register      1
Bwog                            1
EverythingLubbock.com           1
Name: Source, Length: 3666, dtype: int64

In [12]:
# Fill missing Source values with the most frequent source in the training data
# ("Bloomberg" for this dataset). The value is derived from the data instead of
# being hard-coded, and the same training-derived value is used for the test set.
most_common_source = train["Source"].mode().iat[0]
train["Source"] = train["Source"].fillna(most_common_source)
test["Source"] = test["Source"].fillna(most_common_source)

In [13]:
# Re-check missing-value counts in the training data after filling Source.
train.isna().sum()

IDLink               0
Title                0
Headline             0
Source               0
Topic                0
PublishDate          0
Facebook             0
GooglePlus           0
LinkedIn             0
SentimentTitle       0
SentimentHeadline    0
dtype: int64

In [14]:
# Re-check missing-value counts in the test data after filling Source.
test.isna().sum()

IDLink         0
Title          0
Headline       0
Source         0
Topic          0
PublishDate    0
Facebook       0
GooglePlus     0
LinkedIn       0
dtype: int64

In [15]:
#import nltk,re,stopwords for text preprocessing and cleaning
import nltk
import re
from nltk.corpus import stopwords



In [16]:
# Text-cleaning helper applied to the combined title/headline strings.
stop=set(stopwords.words('english'))
def clean(text):
  """Normalise one piece of text before vectorisation.

  Steps:
    1. Tokenise with ``nltk.word_tokenize``.
    2. Lower-case every token and drop English stop words.
    3. Remove standalone integers (digit runs bounded by word boundaries).
    4. Strip commas, periods and colons.

  Tokens are re-joined with single spaces; removing a token's characters in
  steps 3-4 can leave consecutive spaces behind.

  Args:
    text: The string to clean.

  Returns:
    The cleaned, lower-cased string.
  """
  lowered_tokens = (token.lower() for token in nltk.word_tokenize(text))
  filtered_text = ' '.join(token for token in lowered_tokens if token not in stop)
  # An earlier version called filtered_text.replace(r"[^a-zA-Z]+", '') here.
  # str.replace matches its argument literally, not as a regex, so that call never
  # changed anything and has been dropped. (Applied as a real regex it would also
  # have deleted the spaces between words.)
  text_only = re.sub(r'\b\d+\b', '', filtered_text)
  return text_only.replace(',', '').replace('.', '').replace(':', '')

In [17]:
# Build the text inputs for both frames by appending Source and Topic:
#   Text_Title    = Title    + ' ' + Source + ' ' + Topic
#   Text_Headline = Headline + ' ' + Source + ' ' + Topic
# All three inputs are text columns, so they can be preprocessed together.
for frame in (train, test):
    source_and_topic = ' ' + frame['Source'] + ' ' + frame['Topic']
    frame['Text_Title'] = frame['Title'] + source_and_topic
    frame['Text_Headline'] = frame['Headline'] + source_and_topic

In [18]:
# Spot-check one combined title (row label 355) before cleaning.
train.loc[355, 'Text_Title']

"Microsoft's Edge browser can beam videos to your TV, Chromecast ... The Next Web microsoft"

In [19]:
# Run clean() over every combined title and headline, overwriting the raw text.
train['Text_Title'] = train['Text_Title'].map(clean)
test['Text_Title'] = test['Text_Title'].map(clean)

train['Text_Headline'] = train['Text_Headline'].map(clean)
test['Text_Headline'] = test['Text_Headline'].map(clean)

In [20]:
# Spot-check the same combined title (row label 355) after cleaning.
train.loc[355, 'Text_Title']

"microsoft 's edge browser beam videos tv  chromecast  next web microsoft"

In [21]:
#apply TF-idf for converting text into vectors
from sklearn.feature_extraction.text import TfidfVectorizer


In [22]:
# Convert the cleaned text into TF-IDF sparse matrices. Each text column gets its
# own vectorizer, fitted on the training rows and then applied to the test rows.
vectorizer = TfidfVectorizer(use_idf=True)   # titles
vectorizer_ = TfidfVectorizer()              # headlines

# Fit on the training text.
train_v_Title = vectorizer.fit_transform(train['Text_Title'])
train_v_Headline = vectorizer_.fit_transform(train['Text_Headline'])

# Reuse the fitted vocabularies for the test text.
test_v_Title = vectorizer.transform(test['Text_Title'])
test_v_Headline = vectorizer_.transform(test['Text_Headline'])

In [23]:
# Display the title TF-IDF matrix summary (shape, dtype, stored elements).
train_v_Title 

<55932x25612 sparse matrix of type '<class 'numpy.float64'>'
	with 496342 stored elements in Compressed Sparse Row format>

In [24]:
# Display the headline TF-IDF matrix summary (shape, dtype, stored elements).
train_v_Headline

<55932x39031 sparse matrix of type '<class 'numpy.float64'>'
	with 1005073 stored elements in Compressed Sparse Row format>

In [25]:
# Print the (rows, features) shape of the title TF-IDF matrix.
print(train_v_Title.shape)

(55932, 25612)


In [26]:
#apply train test split
from sklearn.model_selection import train_test_split

In [27]:
# Hold out 20% of the rows for validation, then fit a linear SVR that predicts
# SentimentTitle from the title TF-IDF features.
from sklearn.svm import LinearSVR

X_train, X_test, y_train, y_test = train_test_split(
    train_v_Title,
    train["SentimentTitle"],
    test_size=0.20,
    random_state=42,
)

# NOTE(review): LinearSVR is created without a random_state, so the fitted
# coefficients may not be exactly repeatable between runs - confirm if that matters.
model1 = LinearSVR(C=0.2)
model1.fit(X_train, y_train)

LinearSVR(C=0.2, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
          random_state=None, tol=0.0001, verbose=0)

In [28]:
# Shape of the title training split.
X_train.shape

(44745, 25612)

In [29]:
# Shape of the title validation split.
X_test.shape

(11187, 25612)

In [30]:
#import Mean_absolute_error
from sklearn.metrics import mean_absolute_error

In [31]:
# Predict on the held-out title split and measure the mean absolute error.
y_pred1 = model1.predict(X_test)
# scikit-learn's signature is mean_absolute_error(y_true, y_pred).
mae1 = mean_absolute_error(y_test, y_pred1)
# The printed value is 1 - MAE (higher is better), so it is labelled as a score
# rather than as the MAE itself.
print('Score (1 - MAE):', 1 - mae1)

MAE: 0.9434130828276288


In [32]:
# Repeat the split / fit / evaluate steps for SentimentHeadline using the headline
# TF-IDF features. The split gets its own variable names so the title split
# (X_train, X_test, y_train, y_test) is not overwritten; re-running the title
# evaluation cell afterwards still uses the right data.
X_train_headline, X_test_headline, y_train_headline, y_test_headline = train_test_split(
    train_v_Headline,
    train["SentimentHeadline"],
    test_size=0.20,
    random_state=42,
)

model2 = LinearSVR(C=0.1)
model2.fit(X_train_headline, y_train_headline)

y_pred2 = model2.predict(X_test_headline)
# scikit-learn's signature is mean_absolute_error(y_true, y_pred).
mae2 = mean_absolute_error(y_test_headline, y_pred2)
# The printed value is 1 - MAE (higher is better), so it is labelled as a score.
print('Score (1 - MAE):', 1 - mae2)

MAE: 0.929200849501443


In [33]:

# Combined validation score: 1 minus the weighted MAE of the two models
# (title weighted 0.4, headline weighted 0.6).
TITLE_WEIGHT = 0.4
HEADLINE_WEIGHT = 0.6
weighted_mae = (TITLE_WEIGHT * mae1) + (HEADLINE_WEIGHT * mae2)
print('Weighted score (1 - weighted MAE):', 1 - weighted_mae)

MAE: 0.9348857428319173


In [34]:
# Predict sentiment for the test set: model1 scores the title features and model2
# the headline features; results are stored in `title` and `headline`.
title = model1.predict(test_v_Title)
headline = model2.predict(test_v_Headline)

In [35]:
# Number of title predictions produced for the test set.
title.shape

(37288,)

In [36]:
# Number of headline predictions produced for the test set.
headline.shape

(37288,)

In [37]:
# Keep the test set's IDLink column so each prediction can be labelled with its ID.
test_id = test['IDLink']

In [38]:
# Assemble the submission frame: one row per test article with its ID and the two
# predicted sentiment scores.
df = pd.DataFrame(
    {
        'IDLink': test_id,
        'SentimentTitle': title,
        'SentimentHeadline': headline,
    }
)
df

Unnamed: 0,IDLink,SentimentTitle,SentimentHeadline
0,tFrqIR6Chj,0.133970,-0.044043
1,DVAaGErjlF,-0.142431,0.038349
2,OT9UIZm5M2,0.044144,-0.119156
3,lflGp3q2Fj,-0.116369,-0.151514
4,zDYG0SoovZ,-0.080813,0.096306
...,...,...,...
37283,5bYaKaEyN3,0.100222,-0.033164
37284,jmnC32Uh5u,0.008849,-0.001302
37285,yPhgBwobV0,0.239637,-0.068009
37286,adFcODOaiQ,0.043663,0.017596


In [39]:
# Write the predictions to Submission.csv in the current working directory.
# A relative path keeps the notebook runnable on machines other than the author's,
# and index=False keeps the DataFrame's row index out of the file, so it holds only
# the IDLink, SentimentTitle and SentimentHeadline columns.
df.to_csv("Submission.csv", index=False, encoding="ISO-8859-1")