In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
train_data = pd.read_csv('train.tsv', delimiter='\t', header=None)
test_data = pd.read_csv('test.tsv', delimiter='\t', header=None)
valid_data = pd.read_csv('valid.tsv', delimiter='\t', header=None)

In [3]:
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10235,5473.json,mostly-true,There are a larger number of shark attacks in ...,"animals,elections",aclu-florida,,Florida,none,0.0,1.0,1.0,1.0,0.0,"interview on ""The Colbert Report"""
10236,3408.json,mostly-true,Democrats have now become the party of the [At...,elections,alan-powell,,Georgia,republican,0.0,0.0,0.0,1.0,0.0,an interview
10237,3959.json,half-true,Says an alternative to Social Security that op...,"retirement,social-security",herman-cain,,Georgia,republican,4.0,11.0,5.0,3.0,3.0,a Republican presidential debate
10238,2253.json,false,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,,Florida,democrat,3.0,1.0,3.0,0.0,0.0,a televised debate on Miami's WPLG-10 against ...


In [4]:
column_names = ['pic','Truth Value', 'Statement', 'Subjects', 'Person', 'Title', 'State', 'Party', 'Score1', 'Score2', 'Score3', 'Score4', 'Score5', 'Source']
train_data.columns = column_names
test_data.columns = column_names
valid_data.columns = column_names


In [5]:
train_data

Unnamed: 0,pic,Truth Value,Statement,Subjects,Person,Title,State,Party,Score1,Score2,Score3,Score4,Score5,Source
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10235,5473.json,mostly-true,There are a larger number of shark attacks in ...,"animals,elections",aclu-florida,,Florida,none,0.0,1.0,1.0,1.0,0.0,"interview on ""The Colbert Report"""
10236,3408.json,mostly-true,Democrats have now become the party of the [At...,elections,alan-powell,,Georgia,republican,0.0,0.0,0.0,1.0,0.0,an interview
10237,3959.json,half-true,Says an alternative to Social Security that op...,"retirement,social-security",herman-cain,,Georgia,republican,4.0,11.0,5.0,3.0,3.0,a Republican presidential debate
10238,2253.json,false,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,,Florida,democrat,3.0,1.0,3.0,0.0,0.0,a televised debate on Miami's WPLG-10 against ...


In [6]:
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)
valid_data.dropna(inplace=True)

In [7]:
train_data

Unnamed: 0,pic,Truth Value,Statement,Subjects,Person,Title,State,Party,Score1,Score2,Score3,Score4,Score5,Source
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
5,12465.json,true,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece
7,153.json,half-true,I'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,"a Democratic debate in Philadelphia, Pa."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10227,13344.json,pants-fire,"Recently though, the media has reported on tho...",elections,john-rafferty,State Senator,Pennsylvania,republican,0.0,0.0,0.0,0.0,1.0,a debate.
10228,13239.json,barely-true,Stopped by Smiley Cookie to pick up some great...,food,donald-trump,President-Elect,New York,republican,63.0,114.0,51.0,37.0,61.0,a Facebook post.
10230,11018.json,barely-true,The Supreme Courts views are radically out of ...,"gays-and-lesbians,polls,supreme-court",ted-cruz,Senator,Texas,republican,36.0,33.0,15.0,19.0,8.0,an interview on NPR
10231,2930.json,half-true,"When it comes to the state deficit, Wisconsin ...",state-budget,alberta-darling,"State Senator, 8th District",Wisconsin,republican,1.0,1.0,2.0,1.0,1.0,a television interview


In [8]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['Person'] + ' ' + train_data['Party'] + ' ' + train_data['Source'] + ' ' + train_data['Statement'])
X_test = vectorizer.transform(test_data['Person'] + ' ' + test_data['Party'] + ' ' + test_data['Source'] + ' ' + test_data['Statement'])
X_valid = vectorizer.transform(valid_data['Person'] + ' ' + valid_data['Party'] + ' ' + valid_data['Source'] + ' ' + valid_data['Statement'])

In [9]:
y_train = train_data['Truth Value']
y_test = test_data['Truth Value']
y_valid = valid_data['Truth Value']

In [10]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

In [11]:
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

Accuracy: 0.26260257913247365
Classification Report:
              precision    recall  f1-score   support

 barely-true       0.23      0.14      0.17       139
       false       0.28      0.26      0.27       167
   half-true       0.24      0.36      0.29       182
 mostly-true       0.25      0.35      0.29       173
  pants-fire       0.00      0.00      0.00        43
        true       0.36      0.24      0.29       149

    accuracy                           0.26       853
   macro avg       0.23      0.22      0.22       853
weighted avg       0.26      0.26      0.25       853



In [12]:
vectorizer = CountVectorizer()
X_train2 = vectorizer.fit_transform(train_data['Person'] + ' ' + train_data['Party'] + ' ' + train_data['Source'] + ' ' + train_data['Subjects'])
X_test2 = vectorizer.transform(test_data['Person'] + ' ' + test_data['Party'] + ' ' + test_data['Source'] + ' ' + test_data['Subjects'])
X_valid2 = vectorizer.transform(valid_data['Person'] + ' ' + valid_data['Party'] + ' ' + valid_data['Source'] + ' ' + valid_data['Subjects'])

In [13]:
classifier2 = MultinomialNB()
classifier2.fit(X_train2, y_train)

MultinomialNB()

In [14]:
y_pred = classifier2.predict(X_test2)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

Accuracy: 0.2555685814771395
Classification Report:
              precision    recall  f1-score   support

 barely-true       0.23      0.14      0.17       139
       false       0.26      0.29      0.27       167
   half-true       0.25      0.33      0.28       182
 mostly-true       0.25      0.29      0.27       173
  pants-fire       0.20      0.02      0.04        43
        true       0.29      0.26      0.27       149

    accuracy                           0.26       853
   macro avg       0.25      0.22      0.22       853
weighted avg       0.25      0.26      0.25       853



In [15]:
# import sys
# !pip install textblob

Defaulting to user installation because normal site-packages is not writeable


In [16]:
# import sys
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import classification_report, accuracy_score
# from textblob import Textblob

ImportError: cannot import name 'Textblob' from 'textblob' (/home/lgeddam/.local/lib/python3.9/site-packages/textblob/__init__.py)