In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB,BernoulliNB,ComplementNB,MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, Perceptron
#from google.colab import files
from sklearn import preprocessing 
from sklearn.feature_selection import chi2,SelectKBest
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer

%run -i "preprocess.py"

In [32]:
from sklearn.feature_extraction.text import TfidfTransformer

train_fake_bow_url = "train_fake_bow.csv"
train_true_bow_url = "train_true_bow.csv"

tfidf = TfidfTransformer()
#test data
test_url = "test_bow.csv"
test_data = pd.read_csv(test_url, header=None, sep=",")
test_data = tfidf.fit_transform(test_data)
test_df = pd.DataFrame(test_data.todense())

train_fake_data = pd.read_csv(train_fake_bow_url, header=None, sep=",")
train_fake_data = tfidf.fit_transform(train_fake_data)
train_fake_df = pd.DataFrame(train_fake_data.todense())

train_true_data = pd.read_csv(train_true_bow_url, header=None, sep=",")
train_true_data = tfidf.fit_transform(train_true_data)
train_true_df = pd.DataFrame(train_true_data.todense())

Class_fake = np.zeros(len(train_fake_df)).astype(int)
Class_true = np.ones(len(train_true_df)).astype(int)

train_fake_df["Class"] = Class_fake
train_true_df["Class"] = Class_true

train_df = train_fake_df.append(train_true_df,ignore_index=True)

X = train_df.iloc[:,:-1].values
X_unseen = test_df.values

y = train_df.Class.values
y_minus = np.where(y == 1,1,-1)

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.4, 
                                                    random_state=13,
                                                    stratify=y)

y_train_minus = np.where(y_train == 1,1,-1)
y_test_minus = np.where(y_test == 1,1,-1)

In [31]:
train_true_df.head()

Unnamed: 0,0,Class
0,"(0, 1993)\t0.1802671471272398\n (0, 1982)\t...",1
1,"(0, 1962)\t0.24542988746111152\n (0, 1909)\...",1
2,"(0, 1933)\t0.277871589660909\n (0, 1775)\t0...",1
3,"(0, 1997)\t0.2692564048580432\n (0, 1995)\t...",1
4,"(0, 1999)\t0.1434512230850552\n (0, 1899)\t...",1


In [33]:
from sklearn.neural_network import MLPClassifier

m = MLPClassifier(hidden_layer_sizes = (100,), solver= "adam")
                  #early_stopping=True,warm_start=True)

#train over 3 epochs
for i in range(1,4):
    m.fit(X,y_minus)
    print(m.loss_)
    
y_pred_nn = m.predict(X_unseen)

y_pred_nn_output = np.where(y_pred_nn==1,1,0)
#formatting
y_unseen_pred_df = pd.DataFrame(y_pred_nn_output,columns=["Category"])
y_unseen_pred_df.index += 1
y_unseen_pred_df.index.name = "Id"
y_unseen_pred_df.Category = y_unseen_pred_df.Category.astype(int)

y_unseen_pred_df.to_csv("output.csv")

print("output ready")

0.0026061888375363443
0.0024367417868171037
0.0026814290843389854
output ready


In [34]:
#retrain on all train data
model = AdaBoostClassifier(GaussianNB(),n_estimators=1,learning_rate=0.35)
model.fit(X,y_minus)
y_pred_boost_nb = model.predict(X_unseen)

model = AdaBoostClassifier(LinearSVC(),algorithm="SAMME")
model.fit(X,y_minus)
y_pred_linsvc = model.predict(X_unseen)

model = AdaBoostClassifier(Perceptron(),n_estimators=10,learning_rate=0.3,algorithm="SAMME")
model.fit(X,y_minus)
y_pred_boost_perceptron = model.predict(X_unseen)

y_pred_ensemble = np.zeros(len(y_pred_boost_nb))

#majority vote
for i in range(len(y_pred_boost_nb)):
    if y_pred_boost_perceptron[i] + y_pred_linsvc[i] + y_pred_boost_nb[i] > 0: 
        y_pred_ensemble[i] = 1
    else:  y_pred_ensemble[i] = 0

#formatting
y_unseen_pred_df = pd.DataFrame(y_pred_ensemble,columns=["Category"])
y_unseen_pred_df.index += 1
y_unseen_pred_df.index.name = "Id"
y_unseen_pred_df.Category = y_unseen_pred_df.Category.astype(int)

y_unseen_pred_df.to_csv("output.csv")


In [22]:
y_pred_linsvc_output = np.where(y_pred_boost_perceptron==1,1,0)
#formatting
y_unseen_pred_df = pd.DataFrame(y_pred_linsvc_output,columns=["Category"])
y_unseen_pred_df.index += 1
y_unseen_pred_df.index.name = "Id"
y_unseen_pred_df.Category = y_unseen_pred_df.Category.astype(int)

y_unseen_pred_df.to_csv("output.csv")


In [19]:
%run -i "tfidf.py"