In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Read the labelled training set and the unlabelled test set. Both paths are
# relative, so the CSV files must sit in the notebook's working directory.
dataset, data_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train3.csv', 'test_without_labels3.csv')
)

In [None]:
# Preview the first five rows of the training frame to sanity-check the load.
dataset.head(5)

In [None]:
import seaborn as sns
# Bar chart of how many training rows carry each value of the 'Label' column.
sns.countplot(data=dataset, x='Label')

In [None]:
# Pull the raw text column ('Content') out of each split; the training labels
# are read separately from dataset['Label'] further down.
reviews, reviews_test = (frame['Content'] for frame in (dataset, data_test))

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Shared NLP helpers used by the lemmatize_text* functions and the stopword
# filter in the cleaning cell.
lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()

# 'english2' is not one of the stopword lists distributed with NLTK; it is
# presumably a custom list dropped into nltk_data/corpora/stopwords on the
# author's machine (TODO confirm). Prefer it when present so existing results
# are unchanged, but fall back to the stock English list so the notebook still
# runs on a fresh environment.
try:
    stop = stopwords.words('english2')
except OSError:
    import warnings

    warnings.warn(
        "NLTK stopword list 'english2' not found; "
        "falling back to the standard 'english' list."
    )
    stop = stopwords.words('english')

def _lemmatize_tokens(text, pos):
    """Whitespace-tokenize ``text`` and lemmatize every token as part of speech ``pos``.

    Uses the module-level ``w_tokenizer`` and ``lemmatizer`` objects and
    returns the lemmas as a list, in token order.
    """
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, pos=pos))
    return lemmas


def lemmatize_text1(text):
    """Return the whitespace tokens of ``text`` lemmatized as verbs (``pos="v"``)."""
    return _lemmatize_tokens(text, "v")


def lemmatize_text2(text):
    """Return the whitespace tokens of ``text`` lemmatized as nouns (``pos="n"``)."""
    return _lemmatize_tokens(text, "n")


In [None]:
# Stack train and test text into one Series (train rows first) so both splits
# go through identical preprocessing. ignore_index=True gives the result a
# fresh 0..n-1 index; without it the test rows reuse the training rows' index
# labels, which makes any label-based lookup ambiguous. Downstream cells only
# slice positionally, so they are unaffected.
article = pd.concat([reviews, reviews_test], ignore_index=True)

In [None]:
# Number of documents in the combined (train + test) corpus.
article.shape

In [None]:
# Normalise the combined corpus for the TF-IDF model. Each step overwrites
# `article`, so re-create it with the concat cell before re-running this one.

# Lowercase and collapse every run of whitespace to a single space.
article = article.apply(lambda text: " ".join(text.lower().split()))

# The patterns below are regular expressions, so regex=True must be passed
# explicitly: since pandas 2.0 Series.str.replace treats the pattern as a
# literal string by default, which would silently leave punctuation and digits
# in place. Raw strings avoid Python's invalid-escape warnings for \w and \d.
article = article.str.replace(r'[^\w\s]', '', regex=True)  # remove punctuation
article = article.str.replace(r'\d+', '', regex=True)  # remove numbers

# Lemmatize every token first as a verb, then as a noun, re-joining to a
# single string after each pass.
article = article.apply(lambda text: " ".join(lemmatize_text1(text)))
article = article.apply(lambda text: " ".join(lemmatize_text2(text)))

# Drop stopwords. A set gives O(1) membership tests instead of scanning the
# stopword list once per token.
stop_words = set(stop)
article = article.apply(
    lambda text: " ".join(word for word in text.split() if word not in stop_words)
)


In [None]:
ytrain = dataset['Label']
from sklearn.feature_extraction.text import TfidfVectorizer

# `article` holds the training texts followed by the test texts, so the split
# point is the number of training rows. Deriving it from the data (instead of
# a hard-coded 25000) keeps features and labels aligned for any dataset size.
n_train = len(ytrain)

# Fit the vocabulary and IDF weights on the training texts only, then reuse
# them to transform the test texts.
vectorizer = TfidfVectorizer()
xtrain = vectorizer.fit_transform(article.iloc[:n_train])
xtest = vectorizer.transform(article.iloc[n_train:])

In [None]:
# (number of training documents, number of TF-IDF vocabulary terms).
xtrain.shape

# Classical ML method

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report as clrp
from sklearn.svm import LinearSVC

# 5-fold cross-validation of a linear SVM on the TF-IDF features.
# NOTE(review): KFold does not shuffle by default, so folds follow file order;
# if the CSV is sorted by label the folds will be unbalanced - confirm.
# `kf` is reused by the neural-network cross-validation cell further down.
kf = KFold(n_splits=5)
classifier2 = LinearSVC(random_state=0, tol=1e-5)

for train_index, test_index in kf.split(xtrain, ytrain):
    # KFold yields positional row numbers. Use .iloc on the label Series so the
    # selection stays correct even if its index is not the default 0..n-1.
    classifier2.fit(xtrain[train_index], ytrain.iloc[train_index])
    ypred = classifier2.predict(xtrain[test_index])
    ytestt = ytrain.iloc[test_index]
    # One precision/recall/F1 report per held-out fold.
    print(clrp(ytestt, ypred))

In [None]:
# Retrain the linear SVM on the full training set and label the test texts.
# fit() returns the estimator itself, so the two steps can be chained.
y_pred = classifier2.fit(xtrain, ytrain).predict(xtest)

In [None]:
# Attach the LinearSVC predictions to the test frame as a 'Predicted' column.
# This relies on the rows of xtest being in the same order as data_test.
data_test['Predicted']=y_pred

In [None]:
# Remove the raw text so the exported file holds only the remaining columns
# and the predictions. errors='ignore' makes the cell safe to re-run: without
# it a second execution raises KeyError because 'Content' is already gone.
data_test = data_test.drop(columns='Content', errors='ignore')

In [None]:
# Write the LinearSVC predictions to disk without the row index; the header
# row is written by default.
# NOTE(review): the destination is an absolute, machine-specific Windows path
# and will fail on any other machine - consider a relative, configurable path.
# to_csv returns None when given a path, so `export_csv` is always None.
export_csv = data_test.to_csv(
    r'C:\Users\Sarah\Desktop\export_dataframe3.csv',
    index=False,
)

# Neural Network Method

In [None]:
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.callbacks import EarlyStopping as stoppoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Preprocess text for the neural network. Start again from the raw reviews:
# the Keras tokenizer works on the lightly cleaned text (no lemmatization or
# stopword removal).
# ignore_index=True gives the combined Series a fresh 0..n-1 index instead of
# duplicated train/test labels; later cells only slice it positionally.
article = pd.concat([reviews, reviews_test], ignore_index=True)
ytrain = dataset['Label']

# Lowercase and collapse every run of whitespace to a single space.
article = article.apply(lambda text: " ".join(text.lower().split()))

# Replace HTML line breaks with a space *before* stripping punctuation.
# Presumably the reviews contain "<br />" tags (the ' br ' clean-up below
# suggests so - TODO confirm); without this step "end.<br /><br />Next"
# becomes "endbr br Next" and the "br" fused to "end" can no longer be removed.
article = article.str.replace(r'<br\s*/?>', ' ', regex=True)

# These patterns are regular expressions, so regex=True is required: pandas
# >= 2.0 treats the pattern as a literal string by default, which would leave
# punctuation and digits untouched.
article = article.str.replace(r'[^\w\s]', '', regex=True)  # remove punctuation
article = article.str.replace(r'\d+', '', regex=True)  # remove numbers

# Literal replacement kept for any stand-alone "br" tokens that remain.
article = article.str.replace(' br ', ' ', regex=False)  # remove breaks

In [None]:
# Number of distinct words in the training reviews. Each review is a single
# string, so np.hstack over the Series only stacked whole reviews and the old
# expression counted distinct *reviews*; the texts must be split into words
# first. The split point comes from the training frame rather than a
# hard-coded 25000.
n_train = len(dataset)
print(len({word for text in article.iloc[:n_train] for word in text.split()}))

In [None]:
ytrain = dataset['Label']

# Word count of every review (train and test), then the mean and standard
# deviation of those counts - useful for choosing the padding length.
result = [len(tokens) for tokens in map(str.split, article)]
mean_words = np.mean(result)
std_words = np.std(result)
print("Mean %.2f words (%f)" % (mean_words, std_words))

In [None]:
# Convert texts to integer word-index sequences. (The embedding itself is
# learned later by the model's Embedding layer.)
# `article` holds the training texts followed by the test texts; derive the
# split point from the labels instead of hard-coding 25000 so sequences and
# labels stay aligned for any dataset size.
n_train = len(ytrain)
train_texts = article.iloc[:n_train]
test_texts = article.iloc[n_train:]

# The word index is fitted on the training texts only. num_words=20000 caps
# the vocabulary used by texts_to_sequences; it must match the input dimension
# of the Embedding layer.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(train_texts)

X_train = tokenizer.texts_to_sequences(train_texts)
X_test = tokenizer.texts_to_sequences(test_texts)

In [None]:
# Bring every sequence to a fixed length of `maxlen` tokens: shorter ones are
# zero-padded at the end (padding='post'); longer ones are cut according to
# pad_sequences' default `truncating` setting.
maxlen = 500

X_train, X_test = (
    pad_sequences(seqs, maxlen=maxlen, padding='post')
    for seqs in (X_train, X_test)
)

In [None]:
# (number of training sequences, padded sequence length).
X_train.shape

In [None]:
from keras.models import Sequential
# Import the layers from the public `keras.layers` namespace; the
# `keras.layers.convolutional` / `keras.layers.embeddings` submodules are
# internal paths that are not available in every Keras release.
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Embedding
from keras.preprocessing import sequence

# Small 1-D CNN text classifier:
#   Embedding -> Conv1D(32 filters, width 3) -> MaxPooling1D(2) -> Flatten
#   -> Dense(10, sigmoid) -> Dense(1, sigmoid)
# The Embedding dimensions are taken from the tokenizer and the padding length
# (20000 and 500 in the cells above) so they cannot drift out of sync with the
# preprocessing.
model = Sequential()
model.add(Embedding(tokenizer.num_words, 32, input_length=maxlen))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D(2))
model.add(Flatten())
model.add(Dense(10, activation='sigmoid'))
# Single sigmoid output trained with binary cross-entropy.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
from keras.models import clone_model

# 5-fold cross-validation of the CNN, reusing the `kf` splitter from the
# LinearSVC section and splitting on the padded sequences the network
# actually consumes.
for train_index, test_index in kf.split(X_train, ytrain):
    # Train a fresh copy of the architecture for every fold. clone_model
    # re-creates the layers with newly initialised weights; fitting the shared
    # `model` instead would carry weights over from fold to fold, so later
    # folds would be scored on rows the network had already been trained on.
    # The compile settings mirror those used for `model`.
    # Note: `model` itself is left untrained by this loop, so the final-fit
    # cell now trains it from scratch.
    fold_model = clone_model(model)
    fold_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # KFold yields positional row numbers, hence .iloc on the label Series.
    fold_model.fit(X_train[train_index], ytrain.iloc[train_index], epochs=1, batch_size=128, verbose=2)
    ypred = fold_model.predict(X_train[test_index])
    # Round each sigmoid output to a hard 0/1 label.
    yp = [int(round(prob[0])) for prob in ypred]
    ytestt = ytrain.iloc[test_index]
    print(clrp(ytestt, yp))

In [None]:
# Train on the full training set for one epoch, then score the test sequences.
model.fit(X_train, ytrain, batch_size=128, epochs=1, verbose=2)
ypred = model.predict(X_test)

# Each prediction row is indexed at position 0 for its sigmoid output, which
# is rounded to a hard 0/1 label.
yp = [int(round(prob)) for prob in ypred[:, 0]]

In [None]:
# Build the submission table: a running 0-based 'Id' per test row plus the
# network's predicted label, then write it without the row index (the header
# row is written by default).
# NOTE(review): the destination is an absolute, machine-specific Windows path
# and will fail on any other machine - consider a relative, configurable path.
# to_csv returns None when given a path, so `export_csv` is always None.
predictions = pd.DataFrame({
    'Id': np.arange(X_test.shape[0]),
    'Predicted': yp,
})
export_csv = predictions.to_csv(
    r'C:\Users\Sarah\Desktop\export_dataframe10.csv',
    index=False,
)