In [316]:
import pandas as pd
import os.path
import pickle
import numpy as np
import keras.utils
import time
from keras.callbacks import TensorBoard, CSVLogger
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras import optimizers
from keras.layers import Input
from keras.models import Model
from keras.utils.vis_utils import plot_model
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.models import load_model
from nltk.corpus import stopwords
import operator
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from textblob import TextBlob
from wordcloud import WordCloud
nltk.download('stopwords')
tokenizer=RegexpTokenizer(r'\w+')
ps=PorterStemmer()
en_stop=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Jazi
[nltk_data]     Designs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [317]:
from tensorflow.keras.utils import to_categorical

In [318]:
df_fake = pd.read_excel('train.xlsx')
df_fake.head()

Unnamed: 0,2635.json,False,Says the Annies List political group supports third-trimester abortions on demand.,abortion,dwayne-bohac,State representative,Texas,republican,0.1,1,0.2,0.3,0.4,a mailer
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver
2,1123.json,False,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release
3,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN
4,12465.json,True,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,republican,0,3,2,5,1,a an online opinion-piece


In [319]:
train_data = df_fake.apply(lambda x: x.astype(str).str.lower())
train_data.shape

(10268, 14)

In [320]:
feature_cols =["id", "label", "statement", "subject", "speaker", "job", "state", "party",
                                            "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"]
train_data.columns=feature_cols
train_data.columns

Index(['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state',
       'party', 'barely-true', 'false', 'half-true', 'mostly-true',
       'pants-fire', 'venue'],
      dtype='object')

In [321]:
train_data.isnull().sum()

id             0
label          0
statement      0
subject        0
speaker        0
job            0
state          0
party          0
barely-true    0
false          0
half-true      0
mostly-true    0
pants-fire     0
venue          0
dtype: int64

In [322]:
print(train_data.label.unique())

['half-true' 'mostly-true' 'false' 'true' 'barely-true' 'pants-fire']


In [323]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10268 entries, 0 to 10267
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           10268 non-null  object
 1   label        10268 non-null  object
 2   statement    10268 non-null  object
 3   subject      10268 non-null  object
 4   speaker      10268 non-null  object
 5   job          10268 non-null  object
 6   state        10268 non-null  object
 7   party        10268 non-null  object
 8   barely-true  10268 non-null  object
 9   false        10268 non-null  object
 10  half-true    10268 non-null  object
 11  mostly-true  10268 non-null  object
 12  pants-fire   10268 non-null  object
 13  venue        10268 non-null  object
dtypes: object(14)
memory usage: 1.1+ MB


## Label column

In [324]:
y_label_dict = {"pants-fire" : 0, "false" : 1, "barely-true" : 2, "half-true" : 3, "mostly-true" : 4, "true" : 5}
train_data['output'] = train_data['label'].apply(lambda x: y_label_dict[x])

In [325]:
train_data['output'].value_counts()

3    2123
1    1997
4    1966
5    1683
2    1657
0     842
Name: output, dtype: int64

## Speaker

In [326]:
frequent_speakers = {'barack-obama' : 0, 'donald-trump' : 1, 'hillary-clinton' : 2, 
                     'mitt-romney' : 3, 'scott-walker' : 4, 'john-mccain' : 5, 
                     'rick-perry' : 6, 'chain-email' : 7, 'marco-rubio' : 8, 'viral-image':13,
                     'rick-scott' : 9, 'ted-cruz' : 10, 'bernie-s' : 11, 'newt-gingrich':16,
                     'chris-christie' : 12, 'facebook-posts' : 13,'blog-posting':13, 
                     'charlie-crist' : 14, 'congressional' : 15, 'republican' : 15, 
                     'national-committe' : 15, 'democratic':15}

frequent_speakers

{'barack-obama': 0,
 'donald-trump': 1,
 'hillary-clinton': 2,
 'mitt-romney': 3,
 'scott-walker': 4,
 'john-mccain': 5,
 'rick-perry': 6,
 'chain-email': 7,
 'marco-rubio': 8,
 'viral-image': 13,
 'rick-scott': 9,
 'ted-cruz': 10,
 'bernie-s': 11,
 'newt-gingrich': 16,
 'chris-christie': 12,
 'facebook-posts': 13,
 'blog-posting': 13,
 'charlie-crist': 14,
 'congressional': 15,
 'republican': 15,
 'national-committe': 15,
 'democratic': 15}

In [327]:
def get_speaker_id(speaker):
    if isinstance(speaker, str):
        matched = [sp for sp in frequent_speakers if sp in speaker.lower()]
        if len(matched)>0:
            return frequent_speakers[matched[0]]
        else:
            return len(set(frequent_speakers.values())) 
    else:
        return len(set(frequent_speakers.values()))
    
train_data['speaker_id'] = train_data['speaker'].apply(get_speaker_id)
train_data['speaker_id'].value_counts()

17    7363
0      493
15     347
1      276
2      239
3      180
13     156
4      150
5      148
6      142
7      142
8      117
9      116
10      93
11      89
12      78
14      70
16      69
Name: speaker_id, dtype: int64

## Job

In [328]:
frequent_jobs = { 'senator' : 0, 'president' : 1, 'governor' : 2, 
                 'u.s. representative' : 3, 'attorney' : 4, 'congressman' : 5, 
                 'congresswoman' : 5, 'social media posting' : 6, 'lawyer' : 4, 
                 'businessman' : 6,  'radio host' : 8, 'host':8,
                  'mayor' : 7, 'assembly' : 9,'representative' : 3, 
                 'senate' : 9,'state representative' : 10,'milwaukee county executive' : 11,
                 'u.s. house of representatives' : 3,'house representative' : 3,
                 'house of representatives' : 3,'house member':3}


def get_job_id(job):
    if isinstance(job, str):
        matched = [jb for jb in frequent_jobs if jb in job.lower() ]
        if len(matched)>0:
            return frequent_jobs[matched[0]]
        else:
            return len(set(frequent_jobs.values()))
    else:
        return len(set(frequent_jobs.values()))
    
    
train_data['job_id'] = train_data['job'].apply(get_job_id)
train_data['job_id'].value_counts()

12    4608
1     1219
0     1203
3      912
2      896
8      279
9      254
5      232
4      224
7      167
11     150
6      124
Name: job_id, dtype: int64

# Party

In [329]:
frequent_parties = train_data['party'].str.lower().value_counts()[:5].reset_index().to_dict()['index']
frequent_parties = dict((v,k) for k,v in frequent_parties.items())

def get_party_id(party):
    if isinstance(party, str):
        matched = [pt for pt in frequent_parties if pt in party.lower() ]
        if len(matched)>0:
            return frequent_parties[matched[0]]
        else:
            return len(set(frequent_parties.values())) 
    else:
        return len(set(frequent_parties.values()))
    
train_data['party_id'] = train_data['party'].apply(get_party_id)
print(frequent_parties)
train_data['party_id'].value_counts()

{'republican': 0, 'democrat': 1, 'none': 2, 'organization': 3, 'independent': 4}


0    4509
1    3346
2    1746
5     298
3     220
4     149
Name: party_id, dtype: int64

In [330]:
train_data.loc[train_data['party_id']==9]['party'].value_counts()[:90]

Series([], Name: party, dtype: int64)

## States

In [331]:
other_states = ['wyoming', 'colorado', 'hawaii', 'tennessee', 'nevada', 'maine',
                'north dakota', 'mississippi', 'south dakota', 'oklahoma', 
                'delaware', 'minnesota', 'north carolina', 'arkansas', 'indiana', 
                'maryland', 'louisiana', 'idaho', 'iowa', 'west virginia', 
                'michigan', 'kansas', 'utah', 'connecticut', 'montana', 'vermont', 
                'pennsylvania', 'alaska', 'kentucky', 'nebraska', 'new hampshire', 
                'missouri', 'south carolina', 'alabama', 'new mexico']
frequent_states = {'texas': 1, 'florida': 2, 'wisconsin': 3, 'new york': 4, 
                    'illinois': 5, 'ohio': 6, 'georgia': 7, 'virginia': 8, 
                   'rhode island': 9, 'oregon': 10, 'new jersey': 11, 
                   'massachusetts': 12, 'arizona': 13, 'california': 14, 
                   'washington': 15}

In [332]:
for state in other_states:
    frequent_states[state]=0

print(frequent_states)


def get_state_id(state):
    if isinstance(state, str):
        if state.lower().rstrip() in frequent_states:
            return frequent_states[state.lower().rstrip()]
        else:
            if 'washington' in state.lower():
                return frequent_states['washington']
            else:
                return len(set(frequent_states.values()))
    else:
        return len(set(frequent_states.values()))


train_data['state_id'] = train_data['state'].apply(get_state_id)

print(len(set(frequent_states.values())))

train_data['state_id'].value_counts()

{'texas': 1, 'florida': 2, 'wisconsin': 3, 'new york': 4, 'illinois': 5, 'ohio': 6, 'georgia': 7, 'virginia': 8, 'rhode island': 9, 'oregon': 10, 'new jersey': 11, 'massachusetts': 12, 'arizona': 13, 'california': 14, 'washington': 15, 'wyoming': 0, 'colorado': 0, 'hawaii': 0, 'tennessee': 0, 'nevada': 0, 'maine': 0, 'north dakota': 0, 'mississippi': 0, 'south dakota': 0, 'oklahoma': 0, 'delaware': 0, 'minnesota': 0, 'north carolina': 0, 'arkansas': 0, 'indiana': 0, 'maryland': 0, 'louisiana': 0, 'idaho': 0, 'iowa': 0, 'west virginia': 0, 'michigan': 0, 'kansas': 0, 'utah': 0, 'connecticut': 0, 'montana': 0, 'vermont': 0, 'pennsylvania': 0, 'alaska': 0, 'kentucky': 0, 'nebraska': 0, 'new hampshire': 0, 'missouri': 0, 'south carolina': 0, 'alabama': 0, 'new mexico': 0}
16


16    2239
0     1214
1     1008
2     1007
3      717
4      660
5      564
6      449
7      434
8      410
9      373
10     242
11     241
12     216
13     182
14     163
15     149
Name: state_id, dtype: int64

## Subject

In [333]:
frequent_subjects = {'health': 0, 'tax': 1, 'immigration': 2, 'election': 3, 
                     'education': 4, 'candidates-biography': 5, 'economy': 6, 
                     'gun': 7, 'job': 8, 'federal-budget': 6, 'energy': 9, 
                     'abortion': 10, 'foreign-policy': 6, 'state-budget': 6, 
                     'crime': 11, 'gays-and-lesbians' : 12, 'medicare' : 0, 
                     'terrorism' : 11, 'finance' : 6, 'criminal':11,
                     'transportation':13}


def get_subject_id(subject):
    if isinstance(subject, str):
        matched = [sbj for sbj in frequent_subjects if sbj in subject.lower()]
        if len(matched)>0:
            return frequent_subjects[matched[0]]
        else:
            return len(set(frequent_subjects.values())) 
    else:
        return len(set(frequent_subjects.values()))

    
train_data['subject_id'] = train_data['subject'].apply(get_subject_id)
print(len(set(frequent_subjects.values())))
train_data['subject_id'].value_counts()

14


6     2112
14    1912
0     1308
1      906
4      623
3      570
5      513
2      507
11     439
8      410
9      305
7      279
10     172
13     127
12      85
Name: subject_id, dtype: int64

## Venue

In [334]:
frequent_venues = {'news release' : 0, 'interview' : 1, 'press release' : 2, 
                   'speech' : 3, 'tv' : 4, 'tweet' : 5, 'campaign' : 6, 
                   'television' : 4, 'debate' : 7, 'news conference' : 8, 
                   'facebook' : 5, 'press conference' : 8, 'radio' : 9, 
                   'e-mail' : 10, 'email' : 10, 'mail' : 10, 'social media' : 5,
                   'twitter' : 5, 'blog':11, 'article':11,'comment':12, 'web':11}


def get_venue_id(venue):
    if isinstance(venue, str):
        matched = [ven for ven in frequent_venues if ven in venue.lower()]
        if len(matched)>0:
            return frequent_venues[matched[0]]
        else:
            return len(set(frequent_venues.values())) 
    else:
        return len(set(frequent_venues.values()))
    

train_data['venue_id'] = train_data['venue'].apply(get_venue_id)
print(frequent_venues)
train_data['venue_id'].value_counts()

{'news release': 0, 'interview': 1, 'press release': 2, 'speech': 3, 'tv': 4, 'tweet': 5, 'campaign': 6, 'television': 4, 'debate': 7, 'news conference': 8, 'facebook': 5, 'press conference': 8, 'radio': 9, 'e-mail': 10, 'email': 10, 'mail': 10, 'social media': 5, 'twitter': 5, 'blog': 11, 'article': 11, 'comment': 12, 'web': 11}


13    2699
1     1761
3     1062
7      738
6      682
4      570
11     529
5      473
10     356
2      347
12     339
0      325
8      249
9      138
Name: venue_id, dtype: int64

## Statement

In [335]:
def getCleanedText(text):
    text = text.lower()
    tokens=tokenizer.tokenize(text)
    new_tokens=[token for token in tokens if token not in en_stop]
    stemmed_tokens=[ps.stem(tokens) for tokens in new_tokens]
    clean_text=" ".join(stemmed_tokens)
    return clean_text

In [336]:
train_data['statement']=train_data['statement'].apply(getCleanedText)
train_data['statement'].head()

0    declin coal start start natur ga took start be...
1    hillari clinton agre john mccain vote give geo...
2    health care reform legisl like mandat free sex...
3                     econom turnaround start end term
4    chicago bear start quarterback last 10 year to...
Name: statement, dtype: object

In [337]:
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity

In [338]:
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [339]:
train_data['Subjectivity'] = train_data['statement'].apply(getSubjectivity)
train_data['Polarity'] = train_data['statement'].apply(getPolarity)

In [340]:
# def load_statement_vocab_dict(train_data):
#     vocabulary_dict = {}
#     if not os.path.exists('vocabulary.p'):
#         tokenizer = Tokenizer()
#         tokenizer.fit_on_texts(train_data['statement'])
#         vocabulary_dict = tokenizer.word_index
#         print(len(vocabulary_dict))
#         pickle.dump(vocabulary_dict, open( "vocabulary.p", "wb" ))
#         print('Created Vocabulary Dictionary...')
#         print('Saved Vocabulary Dictionary...')
#     else:
#         print('Loading Vocabulary Dictionary...')
#         vocabulary_dict = pickle.load(open("vocabulary.p", "rb" ))
#     return vocabulary_dict


# def preprocess_statement(statement):
#     statement = [w for w in statement.split(' ') if w not in stopwords.words('english')]
#     statement = ' '.join(statement)
#     text = text_to_word_sequence(statement)
#     val = [0] * 10
#     val = [vocabulary_dict[t] for t in text if t in vocabulary_dict] 
#     return val


# vocabulary_dict = load_statement_vocab_dict(train_data)
# train_data['word_id'] = train_data['statement'].apply(preprocess_statement)

In [341]:
# import spacy
# # nlp = spacy.load('en')
# nlp = spacy.load('en_core_web_sm')

In [342]:
# !python3 -m spacy download en_core_web_sm

In [343]:
train_data.head()

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely-true,false,...,venue,output,speaker_id,job_id,party_id,state_id,subject_id,venue_id,Subjectivity,Polarity
0,10540.json,half-true,declin coal start start natur ga took start be...,"energy,history,job-accomplishments",scott-surovell,state delegate,virginia,democrat,0,0,...,a floor speech.,3,17,12,1,8,8,3,0.0,0.0
1,324.json,mostly-true,hillari clinton agre john mccain vote give geo...,foreign-policy,barack-obama,president,illinois,democrat,70,71,...,denver,4,0,1,1,5,6,13,0.0,0.0
2,1123.json,false,health care reform legisl like mandat free sex...,health-care,blog-posting,,,none,7,19,...,a news release,1,13,12,2,16,0,0,0.8,0.4
3,9028.json,half-true,econom turnaround start end term,"economy,jobs",charlie-crist,,florida,democrat,15,9,...,an interview on cnn,3,14,12,1,2,6,1,0.0,0.0
4,12465.json,true,chicago bear start quarterback last 10 year to...,education,robin-vos,wisconsin assembly speaker,wisconsin,republican,0,3,...,a an online opinion-piece,5,17,9,0,3,4,13,0.294444,0.0


In [344]:
print(train_data.columns)
train_data.shape

Index(['id', 'label', 'statement', 'subject', 'speaker', 'job', 'state',
       'party', 'barely-true', 'false', 'half-true', 'mostly-true',
       'pants-fire', 'venue', 'output', 'speaker_id', 'job_id', 'party_id',
       'state_id', 'subject_id', 'venue_id', 'Subjectivity', 'Polarity'],
      dtype='object')


(10268, 23)

In [345]:
features = ['output', 'speaker_id', 'job_id', 'party_id', 'barely-true', 'false', 'half-true', 'mostly-true', 
            'pants-fire', 'state_id', 'subject_id', 'venue_id', 'Subjectivity', 'Polarity']

data = train_data[features]
data.head()

Unnamed: 0,output,speaker_id,job_id,party_id,barely-true,false,half-true,mostly-true,pants-fire,state_id,subject_id,venue_id,Subjectivity,Polarity
0,3,17,12,1,0,0,1,1,0,8,8,3,0.0,0.0
1,4,0,1,1,70,71,160,163,9,5,6,13,0.0,0.0
2,1,13,12,2,7,19,3,5,44,16,0,0,0.8,0.4
3,3,14,12,1,15,9,20,19,2,2,6,1,0.0,0.0
4,5,17,9,0,0,3,2,5,1,3,4,13,0.294444,0.0


In [346]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10268 entries, 0 to 10267
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   output        10268 non-null  int64  
 1   speaker_id    10268 non-null  int64  
 2   job_id        10268 non-null  int64  
 3   party_id      10268 non-null  int64  
 4   barely-true   10268 non-null  object 
 5   false         10268 non-null  object 
 6   half-true     10268 non-null  object 
 7   mostly-true   10268 non-null  object 
 8   pants-fire    10268 non-null  object 
 9   state_id      10268 non-null  int64  
 10  subject_id    10268 non-null  int64  
 11  venue_id      10268 non-null  int64  
 12  Subjectivity  10268 non-null  float64
 13  Polarity      10268 non-null  float64
dtypes: float64(2), int64(7), object(5)
memory usage: 1.1+ MB


In [347]:
X = data.drop('output', axis=1)
y = data.output

In [348]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [349]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8214, 13), (2054, 13), (8214,), (2054,))

In [350]:
svm = SVC()
svm.fit(X_train,y_train)

SVC()

In [351]:
svm.score(X_test, y_test)

0.3018500486854917

In [352]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from numpy import unique

In [353]:
import numpy as np

In [354]:
X = np.array(X)

In [355]:
X = X.reshape(X.shape[0], X.shape[1], 1)
print(X.shape)

(10268, 13, 1)


In [356]:
print(unique(y))
print(unique(y).sum())

[0 1 2 3 4 5]
15


In [357]:
y = to_categorical(y, 6)

In [358]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = Sequential()
model.add(Conv1D(64, 2, activation="relu", input_shape=(13,1)))
model.add(Dense(16, activation="relu"))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(3, activation = 'softmax'))
model.compile(loss = 'sparse_categorical_crossentropy', 
     optimizer = "adam",               
              metrics = ['accuracy'])
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_9 (Conv1D)            (None, 12, 64)            192       
_________________________________________________________________
dense_18 (Dense)             (None, 12, 16)            1040      
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 6, 16)             0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 96)                0         
_________________________________________________________________
dense_19 (Dense)             (None, 3)                 291       
Total params: 1,523
Trainable params: 1,523
Non-trainable params: 0
_________________________________________________________________


In [359]:
model.fit(X_train, y_train, batch_size=16,epochs=100, verbose=0)

acc = model.evaluate(X_train, y_train)
print("Loss:", acc[0], " Accuracy:", acc[1])

pred = model.predict(X_test)
pred_y = pred.argmax(axis=-1)

cm = confusion_matrix(y_test, pred_y)
print(cm)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).