In [None]:
import pandas as panda
import numpy as num
from scipy.sparse import coo_matrix, vstack, hstack

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [None]:
#read preprocessed data from file
train_data = panda.read_csv("train_preprocessed.csv", index_col='id')
print(train_data.shape)
train_data.head()

(256442, 5)


Unnamed: 0_level_0,tid1,tid2,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
195611,0,1,two new oldage insurance benefit old people ru...,police disprove bird nest congress person get ...,unrelated
191474,2,3,if come shenzhen sooner later son also come le...,shenzhens gdp outstrips hong kong shenzhen sta...,unrelated
25300,2,4,if come shenzhen sooner later son also come le...,gdp overtopped hong kong shenzhen clarified li...,unrelated
123757,2,8,if come shenzhen sooner later son also come le...,shenzhens gdp overtakes hong kong bureau stati...,unrelated
141761,2,11,if come shenzhen sooner later son also come le...,shenzhens gdp outpaces hong kong defending rum...,unrelated


In [None]:
#dropping null values from file
train_data = train_data.dropna()
print(train_data.shape)
train_data.head()

(256408, 5)


Unnamed: 0_level_0,tid1,tid2,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
195611,0,1,two new oldage insurance benefit old people ru...,police disprove bird nest congress person get ...,unrelated
191474,2,3,if come shenzhen sooner later son also come le...,shenzhens gdp outstrips hong kong shenzhen sta...,unrelated
25300,2,4,if come shenzhen sooner later son also come le...,gdp overtopped hong kong shenzhen clarified li...,unrelated
123757,2,8,if come shenzhen sooner later son also come le...,shenzhens gdp overtakes hong kong bureau stati...,unrelated
141761,2,11,if come shenzhen sooner later son also come le...,shenzhens gdp outpaces hong kong defending rum...,unrelated


In [None]:
#vectorizing using TF-IDF
title1_vector = TfidfVectorizer(analyzer='word',stop_words= 'english').fit(train_data['title1_en'])
title1_tfidf_vector = title1_vector.transform(train_data['title1_en'])

In [None]:
title1_tfidf_vector.shape

(256408, 28629)

In [None]:
#vectorizing title2 column seperately
title2_tfidf_vector = title1_vector.transform(train_data['title2_en'])

In [None]:
#stacking to matrix in order to increase columns
title_stack = hstack([title1_tfidf_vector, title2_tfidf_vector])
title_stack.shape

In [None]:
#splitting data into train and test for input in model training
x_train, x_test, y_train, y_test = train_test_split(title_stack, train_data['label'], test_size=0.2)
#training model
model = LogisticRegression(max_iter=500).fit(x_train, y_train)

In [None]:
#getting accuracy
accuracy_score = model.score(x_test, y_test)
accuracy_score

0.8023478023478023

In [None]:
#printing metrics report
from sklearn import metrics
y_prediction = model.predict(x_test)
print(metrics.classification_report(list(y_test), list(y_prediction)))

              precision    recall  f1-score   support

      agreed       0.72      0.64      0.68     14918
   disagreed       0.79      0.26      0.39      1382
   unrelated       0.83      0.89      0.86     34982

    accuracy                           0.80     51282
   macro avg       0.78      0.60      0.64     51282
weighted avg       0.80      0.80      0.79     51282



In [None]:
#now applying trained model to predict labels in test data
test_data = panda.read_csv("test_preprocessed.csv", index_col='id')
print(test_data.shape)
test_data.head()

(64110, 4)


Unnamed: 0_level_0,tid1,tid2,title1_en,title2_en
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
256442,100672,100673,great coat brother zhu zhu wen mandarin love s...,lin xinsheng birth hard milking huo jianhua se...
256443,162269,162270,nasa reveals fact ufo wreckage found moon,ufo found yuancun jiaocheng county shanxi shoc...
256444,157826,157854,hollow tomato loaded hormone,li chenfan bingbing home photo netizen called ...
256445,109579,74076,ange pavilion geoshui accurate matrimony match...,master one eightcharacter presumption marriage...
256446,15068,15085,50yearold busbus blow 8yearold child rumor rum...,joe johnson disgruntled timing order myth


In [None]:
#dropping null values from test data
test_data = test_data.dropna()
print(test_data.shape)
test_data.head()

(64103, 4)


Unnamed: 0_level_0,tid1,tid2,title1_en,title2_en
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
256442,100672,100673,great coat brother zhu zhu wen mandarin love s...,lin xinsheng birth hard milking huo jianhua se...
256443,162269,162270,nasa reveals fact ufo wreckage found moon,ufo found yuancun jiaocheng county shanxi shoc...
256444,157826,157854,hollow tomato loaded hormone,li chenfan bingbing home photo netizen called ...
256445,109579,74076,ange pavilion geoshui accurate matrimony match...,master one eightcharacter presumption marriage...
256446,15068,15085,50yearold busbus blow 8yearold child rumor rum...,joe johnson disgruntled timing order myth


In [None]:
#vectorizing columns in test data
test_title1_tfidf_vector = title1_vector.transform(test_data['title1_en'])
test_title2_tfidf_vector = title1_vector.transform(test_data['title2_en'])
#stacking to form matric with incresed columns
test_title_stack = hstack([test_title1_tfidf_vector, test_title2_tfidf_vector])

In [None]:
#prediciting labels with the trained model
test_predict_data_labels = model.predict(test_title_stack)

In [None]:
#adding labels to test data
test_data['label'] = test_predict_data_labels

In [None]:
test_data.head()

Unnamed: 0_level_0,tid1,tid2,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
256442,100672,100673,great coat brother zhu zhu wen mandarin love s...,lin xinsheng birth hard milking huo jianhua se...,unrelated
256443,162269,162270,nasa reveals fact ufo wreckage found moon,ufo found yuancun jiaocheng county shanxi shoc...,unrelated
256444,157826,157854,hollow tomato loaded hormone,li chenfan bingbing home photo netizen called ...,unrelated
256445,109579,74076,ange pavilion geoshui accurate matrimony match...,master one eightcharacter presumption marriage...,unrelated
256446,15068,15085,50yearold busbus blow 8yearold child rumor rum...,joe johnson disgruntled timing order myth,unrelated


In [None]:
#extracting label column 
test_label_column = test_data['label']
print(test_label_column)

id
256442    unrelated
256443    unrelated
256444    unrelated
256445    unrelated
256446    unrelated
            ...    
320547    unrelated
320548    unrelated
320549       agreed
320550    unrelated
320551       agreed
Name: label, Length: 64103, dtype: object


In [None]:
#saving output file
from google.colab import files
test_label_column.to_csv('test_logistic_Reg_predicted.csv')
files.download('test_logistic_Reg_predicted.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>