In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, vstack, hstack

from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Load the preprocessed training pairs from CSV; the `id` column becomes the
# DataFrame index.
train_preprocessed_data = pd.read_csv("train_preprocessed.csv", index_col='id')
# Print (rows, columns) as a quick sanity check, then preview the first rows.
print(train_preprocessed_data.shape)
train_preprocessed_data.head()

(256442, 5)


Unnamed: 0_level_0,tid1,tid2,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
195611,0,1,two new oldage insurance benefit old people ru...,police disprove bird nest congress person get ...,unrelated
191474,2,3,if come shenzhen sooner later son also come le...,shenzhens gdp outstrips hong kong shenzhen sta...,unrelated
25300,2,4,if come shenzhen sooner later son also come le...,gdp overtopped hong kong shenzhen clarified li...,unrelated
123757,2,8,if come shenzhen sooner later son also come le...,shenzhens gdp overtakes hong kong bureau stati...,unrelated
141761,2,11,if come shenzhen sooner later son also come le...,shenzhens gdp outpaces hong kong defending rum...,unrelated


In [None]:
# Drop every row that has a missing value in any column.
train_preprocessed_data = train_preprocessed_data.dropna()
# The vectorizing and splitting cells read the cleaned frame through the name
# `train`, which no visible cell assigns. Bind it here so the notebook runs
# top-to-bottom on a fresh kernel (Restart & Run All).
# NOTE(review): the recorded TF-IDF output has 256408 rows, fewer than this
# frame, so the original `train` may have had extra filtering applied -- confirm.
train = train_preprocessed_data
print(train.shape)
train.head()

(256442, 5)


Unnamed: 0_level_0,tid1,tid2,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
195611,0,1,two new oldage insurance benefit old people ru...,police disprove bird nest congress person get ...,unrelated
191474,2,3,if come shenzhen sooner later son also come le...,shenzhens gdp outstrips hong kong shenzhen sta...,unrelated
25300,2,4,if come shenzhen sooner later son also come le...,gdp overtopped hong kong shenzhen clarified li...,unrelated
123757,2,8,if come shenzhen sooner later son also come le...,shenzhens gdp overtakes hong kong bureau stati...,unrelated
141761,2,11,if come shenzhen sooner later son also come le...,shenzhens gdp outpaces hong kong defending rum...,unrelated


In [None]:
# Vectorize titles with TF-IDF (word-level tokens, English stop words removed).
# The single vectorizer `title1` is reused to transform title2_en and both test
# columns, so fit its vocabulary on BOTH training title columns; fitting on
# title1_en alone silently drops every token that only occurs in title2_en.
# NOTE(review): the vectorizer is fit before the train/hold-out split, so IDF
# statistics include hold-out rows -- acceptable for a baseline, but confirm.
title1 = TfidfVectorizer(analyzer='word', stop_words='english').fit(
    pd.concat([train['title1_en'], train['title2_en']])
)
# Sparse matrix with one row per training pair and one column per vocabulary term.
title1_tfidf = title1.transform(train['title1_en'])
print(title1_tfidf)

  (0, 20900)	0.2782352178906178
  (0, 18065)	0.2349668239046241
  (0, 17283)	0.4235321734399695
  (0, 17282)	0.27443372895480683
  (0, 16770)	0.23601713070096275
  (0, 12700)	0.3898510913885437
  (0, 10478)	0.373422075809077
  (0, 3381)	0.3877096476837602
  (0, 2533)	0.33945780976865075
  (1, 27537)	0.12741050654052524
  (1, 22794)	0.2968191274763247
  (1, 22764)	0.1925839097251328
  (1, 21870)	0.46620296071201517
  (1, 14167)	0.21134696983193052
  (1, 14118)	0.2328751128397332
  (1, 13861)	0.23420945735158127
  (1, 11674)	0.22360710910155301
  (1, 10202)	0.28240415966089333
  (1, 8685)	0.31930827655535976
  (1, 5567)	0.3610382809412552
  (1, 4452)	0.3116960780489795
  (1, 54)	0.16812396107639768
  (2, 27537)	0.12741050654052524
  (2, 22794)	0.2968191274763247
  (2, 22764)	0.1925839097251328
  :	:
  (256404, 9005)	0.2280108244166669
  (256404, 8213)	0.47477863679490706
  (256404, 8171)	0.5377702430798319
  (256405, 26722)	0.20347036750933598
  (256405, 26179)	0.22381181485708523
  (256

In [None]:
# Shape of the title1 TF-IDF matrix: (number of rows, number of vocabulary terms).
title1_tfidf.shape

(256408, 28629)

In [None]:
# Vectorize the title2 column separately, reusing the already-fitted `title1`
# vectorizer so both matrices share the same vocabulary and column order.
title2_tfidf = title1.transform(train['title2_en'])
print(title2_tfidf)

  (0, 27819)	0.19503095754681346
  (0, 18534)	0.20491970062617476
  (0, 18125)	0.23574507138668405
  (0, 18065)	0.1614604345105876
  (0, 17282)	0.1885806191064233
  (0, 16745)	0.3452363330389005
  (0, 12653)	0.3662588612942864
  (0, 10425)	0.22798452608809133
  (0, 5813)	0.4898314313647377
  (0, 3592)	0.3077586297142306
  (0, 3329)	0.24894303462908512
  (0, 955)	0.31753664091965406
  (1, 23181)	0.3466168269953833
  (1, 21872)	0.3412236590200946
  (1, 21870)	0.24434078038849966
  (1, 20860)	0.16829786269503835
  (1, 16600)	0.41152848417928184
  (1, 13861)	0.2455021799786589
  (1, 11674)	0.23438862530965363
  (1, 10202)	0.2960206543996011
  (1, 10155)	0.33982119355534657
  (1, 7449)	0.3607021689666378
  (1, 4236)	0.2400392358813575
  (2, 21870)	0.33606848230962055
  (2, 14663)	0.25654730383217644
  :	:
  (256403, 2675)	0.4347006804792253
  (256404, 23607)	0.28232990993598006
  (256404, 14663)	0.25214487902161603
  (256404, 10278)	0.45666122463234426
  (256404, 8113)	0.5302527084976909
  

In [None]:
# Shape of the title2 TF-IDF matrix; it should match title1_tfidf because the
# same fitted vectorizer produced both.
title2_tfidf.shape

(256408, 28629)

In [None]:
# Stack the two sparse matrices horizontally: each row keeps its title1
# features and gains the title2 features as additional columns to the right.
title_stack = hstack([title1_tfidf, title2_tfidf])
print(title_stack)

  (0, 20900)	0.2782352178906178
  (0, 18065)	0.2349668239046241
  (0, 17283)	0.4235321734399695
  (0, 17282)	0.27443372895480683
  (0, 16770)	0.23601713070096275
  (0, 12700)	0.3898510913885437
  (0, 10478)	0.373422075809077
  (0, 3381)	0.3877096476837602
  (0, 2533)	0.33945780976865075
  (0, 56448)	0.19503095754681346
  (0, 47163)	0.20491970062617476
  (0, 46754)	0.23574507138668405
  (0, 46694)	0.1614604345105876
  (0, 45911)	0.1885806191064233
  (0, 45374)	0.3452363330389005
  (0, 41282)	0.3662588612942864
  (0, 39054)	0.22798452608809133
  (0, 34442)	0.4898314313647377
  (0, 32221)	0.3077586297142306
  (0, 31958)	0.24894303462908512
  (0, 29584)	0.31753664091965406
  (1, 27537)	0.12741050654052524
  (1, 22794)	0.2968191274763247
  (1, 22764)	0.1925839097251328
  (1, 21870)	0.46620296071201517
  :	:
  (256406, 26722)	0.20347036750933598
  (256406, 26179)	0.22381181485708523
  (256406, 18884)	0.5702581595042489
  (256406, 15914)	0.12927949661365634
  (256406, 9005)	0.2280108244166669

In [None]:
# Split the stacked features and labels into a training part and a 20% hold-out
# part used to evaluate the model.
print(title_stack.shape)
x_train, x_test, y_train, y_test = train_test_split(
    title_stack,
    train['label'],
    test_size=0.2,
    # Fixed seed so the split (and every metric computed from it) is
    # reproducible across kernel restarts.
    random_state=42,
    # Keep the class proportions equal in both parts; the labels are
    # imbalanced, so a plain random split can skew the rarest class.
    stratify=train['label'],
)

(256408, 57258)


In [None]:
# Train a linear support-vector classifier on the training split.
# random_state is pinned so repeated runs yield the same fitted model; the
# solver otherwise draws from an unseeded random number generator.
model = LinearSVC(random_state=42).fit(x_train, y_train)

In [None]:
# Mean accuracy of the trained classifier on the hold-out split.
score = model.score(x_test, y_test)
score

0.8111033111033111

In [None]:
# Predict labels for the hold-out split and compute one F1 score per class
# (average=None returns an array instead of a single aggregated number).
y_pred = model.predict(x_test)
# f1_score expects (y_true, y_pred) in that order; the ground truth goes first.
score_f1 = f1_score(y_test, y_pred, average=None)
score_f1

array([0.7011205 , 0.46021287, 0.8653368 ])

In [None]:
# Text report with per-class precision, recall, F1 and support for the
# hold-out split.
from sklearn import metrics

report_text = metrics.classification_report(
    y_true=list(y_test),
    y_pred=list(y_pred),
)
print(report_text)

              precision    recall  f1-score   support

      agreed       0.72      0.68      0.70     14864
   disagreed       0.70      0.34      0.46      1320
   unrelated       0.85      0.88      0.87     35098

    accuracy                           0.81     51282
   macro avg       0.75      0.64      0.68     51282
weighted avg       0.81      0.81      0.81     51282



In [None]:
# Load the preprocessed, unlabeled test pairs that the trained model will label;
# the `id` column becomes the DataFrame index.
test_preprocessed_data = pd.read_csv("test_preprocessed.csv", index_col='id')
# Print (rows, columns) as a quick sanity check, then preview the first rows.
print(test_preprocessed_data.shape)
test_preprocessed_data.head()

(64110, 4)


Unnamed: 0_level_0,tid1,tid2,title1_en,title2_en
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
256442,100672,100673,great coat brother zhu zhu wen mandarin love s...,lin xinsheng birth hard milking huo jianhua se...
256443,162269,162270,nasa reveals fact ufo wreckage found moon,ufo found yuancun jiaocheng county shanxi shoc...
256444,157826,157854,hollow tomato loaded hormone,li chenfan bingbing home photo netizen called ...
256445,109579,74076,ange pavilion geoshui accurate matrimony match...,master one eightcharacter presumption marriage...
256446,15068,15085,50yearold busbus blow 8yearold child rumor rum...,joe johnson disgruntled timing order myth


In [None]:
# Handle missing titles in the test data WITHOUT dropping rows: every test id
# must receive a prediction, and dropping rows would leave those ids out of the
# exported file. An empty string vectorizes to an all-zero TF-IDF row, so the
# model can still emit a label for it.
# NOTE(review): assumes the missing values are in the two title columns -- confirm.
test_preprocessed_data = test_preprocessed_data.fillna(
    {'title1_en': '', 'title2_en': ''}
)
print(test_preprocessed_data.shape)
test_preprocessed_data.head()

(64103, 4)


Unnamed: 0_level_0,tid1,tid2,title1_en,title2_en
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
256442,100672,100673,great coat brother zhu zhu wen mandarin love s...,lin xinsheng birth hard milking huo jianhua se...
256443,162269,162270,nasa reveals fact ufo wreckage found moon,ufo found yuancun jiaocheng county shanxi shoc...
256444,157826,157854,hollow tomato loaded hormone,li chenfan bingbing home photo netizen called ...
256445,109579,74076,ange pavilion geoshui accurate matrimony match...,master one eightcharacter presumption marriage...
256446,15068,15085,50yearold busbus blow 8yearold child rumor rum...,joe johnson disgruntled timing order myth


In [None]:
# Vectorize both test title columns with the vectorizer fitted on the training
# data, so the test features use the same vocabulary and column order.
test_title1_tfidf, test_title2_tfidf = (
    title1.transform(test_preprocessed_data[column_name])
    for column_name in ('title1_en', 'title2_en')
)

# Stack horizontally to form a matrix with the same column layout the model
# was trained on (title1 features followed by title2 features).
test_title_stack = hstack([test_title1_tfidf, test_title2_tfidf])

# Predict a label for every test row with the trained model.
test_predict = model.predict(test_title_stack)

In [None]:
# Attach the predicted labels to the test frame as a new `label` column; the
# predictions are positional, so row i of test_predict belongs to row i here.
test_preprocessed_data['label'] = test_predict
test_preprocessed_data.head()

Unnamed: 0_level_0,tid1,tid2,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
256442,100672,100673,great coat brother zhu zhu wen mandarin love s...,lin xinsheng birth hard milking huo jianhua se...,unrelated
256443,162269,162270,nasa reveals fact ufo wreckage found moon,ufo found yuancun jiaocheng county shanxi shoc...,unrelated
256444,157826,157854,hollow tomato loaded hormone,li chenfan bingbing home photo netizen called ...,unrelated
256445,109579,74076,ange pavilion geoshui accurate matrimony match...,master one eightcharacter presumption marriage...,unrelated
256446,15068,15085,50yearold busbus blow 8yearold child rumor rum...,joe johnson disgruntled timing order myth,unrelated


In [None]:
# Extract only the predicted `label` column; the resulting Series keeps the
# `id` index of the test frame.
test_labels = test_preprocessed_data['label']
test_labels.head()

id
256442    unrelated
256443    unrelated
256444    unrelated
256445    unrelated
256446    unrelated
Name: label, dtype: object

In [None]:
# Write the predicted labels (indexed by id) to CSV first, so the file is
# produced even when the Colab-only browser download is unavailable.
test_labels.to_csv('test_linearSVM_predicted.csv')
try:
    # google.colab is only importable inside a Google Colab runtime.
    from google.colab import files
except ImportError:
    print("google.colab not available; predictions saved to test_linearSVM_predicted.csv")
else:
    # Trigger a browser download of the file that was just written.
    files.download('test_linearSVM_predicted.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>