In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import os
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

from string import punctuation
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report,confusion_matrix

[nltk_data] Downloading package stopwords to /home/kvats/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/kvats/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/kvats/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [22]:
first_n_words = 200
def trim_string(x, n_words=None):
    """Keep only the leading whitespace-separated words of a text.

    Args:
        x: text to truncate. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str`` first.
        n_words (int, optional): maximum number of words to keep. Defaults to
            the module-level ``first_n_words``, looked up at call time.

    Returns:
        str: the kept words joined by single spaces. Runs of whitespace
        (including newlines) between words are collapsed to one space.
    """
    limit = first_n_words if n_words is None else n_words
    if limit <= 0:
        return ''
    # `x != x` is only true for NaN; without this guard a missing cell would
    # raise AttributeError on `.split`.
    if x is None or (isinstance(x, float) and x != x):
        return ''
    if not isinstance(x, str):
        x = str(x)

    # maxsplit stops scanning after `limit` words; the unsplit remainder (if
    # any) ends up in the last element and is dropped by the slice.
    words = x.split(maxsplit=limit)
    return ' '.join(words[:limit])

In [25]:
def dataset_split(path='raw/',ratio=(0.7,0.15,0.15))->tuple:
    """Load the raw news CSVs, build the text columns and split them three ways.

    Reads ``True.csv`` (label 1) and ``Fake.csv`` (label 0) from ``path``,
    concatenates them, adds a ``titletext`` column ("<title>. <text>"),
    truncates ``text`` and ``titletext`` with ``trim_string`` and returns
    stratified train / validation / test splits.

    Args:
        path (str, optional): directory holding the raw files. Defaults to 'raw/'.
        ratio (tuple, optional): (train, validation, test) fractions; must have
            three entries that sum to 1. Defaults to (0.7,0.15,0.15).

    Returns:
        tuple: ``(train_df, val_df, test_df)``, each a ``pd.DataFrame`` with
        the columns ``label``, ``title``, ``text`` and ``titletext``.

    Raises:
        AssertionError: if ``ratio`` does not have three entries summing to 1.
    """
    import math  # local import: only needed for the tolerance check below

    # Length is checked first so the message is not masked by a sum() failure;
    # isclose avoids rejecting valid ratios such as (0.6, 0.3, 0.1) whose float
    # sum is 0.9999999999999999.
    assert len(ratio)==3 and math.isclose(sum(ratio), 1.0), "ratio error"

    # Label convention: 1 = real news (True.csv), 0 = fake news (Fake.csv).
    true_df = pd.read_csv(os.path.join(path, 'True.csv'))
    fake_df = pd.read_csv(os.path.join(path, 'Fake.csv'))
    true_df["label"] = 1
    fake_df["label"] = 0
    df = pd.concat([true_df, fake_df], ignore_index=True)

    # A missing title/text is read as NaN, which would propagate through the
    # string concatenation and break the per-cell string handling below.
    df['title'] = df['title'].fillna('')
    df['text'] = df['text'].fillna('')

    df['titletext'] = df['title'] + ". " + df['text']
    df['text'] = df['text'].apply(trim_string)
    df['titletext'] = df['titletext'].apply(trim_string)
    df = df.reindex(columns=['label', 'title', 'text', 'titletext'])
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # First carve off validation+test together, then divide that remainder.
    holdout = ratio[1]+ratio[2]
    train_df, temp_df = train_test_split(df, test_size=holdout, random_state=42, stratify=df["label"])
    val_df, test_df = train_test_split(temp_df, test_size=ratio[2]/holdout, random_state=42, stratify=temp_df["label"])

    # Return independent copies so callers can overwrite columns on a split
    # without pandas warning about assignment on a slice of another frame.
    return train_df.copy(),val_df.copy(),test_df.copy()

In [4]:
# The data path used below ('raw/') is relative to the working directory.
# Only step up one level when 'raw' is not already visible from here, so that
# re-running this cell does not keep climbing the directory tree.
if not os.path.isdir('raw'):
    os.chdir('..')
os.getcwd()

'/home/kvats/private/nw/ECE-143-Final-Project'

In [32]:
# ratio is (train, validation, test): 10% train, 80% validation, 10% test.
df_train, df_val, df_test = dataset_split(ratio=(0.1, 0.8, 0.1))

In [33]:
# removing punctuation
def remove_punctuations(string):
    """Return ``string`` without any character found in the stdlib ``punctuation`` constant.

    Characters are dropped, not replaced, so "well-known" becomes "wellknown".
    """
    kept_chars = filter(lambda ch: ch not in punctuation, string)
    return ''.join(kept_chars)

# removing stopwords
# Kept as a set so the per-token membership test below is a hash lookup.
stop_words = set(stopwords.words('english'))
def remove_stopwords(string):
    """Tokenize ``string`` and drop every token that is in ``stop_words``.

    Returns the surviving tokens joined by single spaces. Matching is
    case-sensitive against the NLTK English stopword list.
    """
    kept_tokens = []
    for token in word_tokenize(string):
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [34]:
def clean_titletext(value):
    """Normalise one ``titletext`` cell: lowercase, strip punctuation, drop stopwords."""
    # str() so that a non-string cell (e.g. NaN) does not raise on .lower()
    lowered = str(value).lower()
    return remove_stopwords(remove_punctuations(lowered))

# Same three cleaning steps as before, but applied in one pass over the
# column with Series.map instead of three row-wise DataFrame.apply passes
# per split.
df_train['titletext'] = df_train['titletext'].map(clean_titletext)
df_test['titletext'] = df_test['titletext'].map(clean_titletext)

In [35]:
# Unigram bag-of-words; min_df=2 drops terms seen in fewer than two documents.
vect = CountVectorizer(min_df=2, ngram_range=(1, 1))

# The vocabulary is learned from the training split only and then reused for
# the test split. fit_transform builds the training matrix while fitting,
# instead of tokenizing the training corpus a second time in transform().
X_train = vect.fit_transform(df_train['titletext'])
X_test = vect.transform(df_test['titletext'])

y_train = df_train['label']
y_test = df_test['label']

In [38]:
# Hyper-parameter grid handed to GridSearchCV; currently a single candidate.
parameters = [
    dict(penalty=['l2'], solver=['lbfgs'], C=[0.5]),
]

In [39]:
lr = LogisticRegression()

# The cross-validation fits are independent of each other, so run them in
# parallel on all available cores (n_jobs=-1) rather than one after another.
tuning_model = GridSearchCV(lr, parameters, cv=5, verbose=2, n_jobs=-1)
tuning_model.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


KeyboardInterrupt: 

In [None]:
# Summary of the grid search; requires the tuning_model.fit(...) cell to have completed.
print(
    "best parameters: {}".format(tuning_model.best_params_),
    "best score:      {:0.5f}".format(tuning_model.best_score_),
    sep="\n",
)

In [None]:
# Fit the final classifier with fixed hyper-parameters and report
# per-class precision / recall / F1 on the held-out test split.
model_final = LogisticRegression(penalty="l2", C=0.5, solver="lbfgs").fit(X_train, y_train)
y_pred = model_final.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))