In [None]:
import pandas as pd
import numpy as np
#for text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#for word embedding
#import gensim
#from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

def twenty_newsgroup_to_csv():
    """Fetch the 20 Newsgroups corpus and write it to '20_newsgroup.csv'.

    The CSV has one row per post with the columns ``text``, ``target``
    (integer label), ``title`` (newsgroup name for that label) and ``date``
    (timestamp taken when the function runs). Nothing is returned.
    """
    bunch = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

    # Two rows (texts, integer labels) transposed into two columns.
    posts = pd.DataFrame([bunch.data, bunch.target.tolist()]).T
    posts.columns = ['text', 'target']

    # Row i holds the newsgroup name for label i, so the index is the join key.
    group_names = pd.DataFrame(bunch.target_names, columns=['title'])

    labelled = posts.merge(group_names, left_on='target', right_index=True)
    labelled['date'] = pd.to_datetime('now')
    labelled.to_csv('20_newsgroup.csv')


twenty_newsgroup_to_csv()

In [None]:
# The first CSV column is the index written by to_csv.
data = pd.read_csv('20_newsgroup.csv', index_col=0)
# Select only 5 classes
class_mask = data["target"].isin([7, 9, 13, 18, 19])
data = data.loc[class_mask]
data.head()

In [None]:
# Hold out 10% of the filtered posts as the final test set.
# Each split is copied explicitly because later cells add columns to
# df_train / df_test; without the copy pandas (before copy-on-write) treats
# them as slices of `data` and emits SettingWithCopyWarning.
df_train, df_test = (
    split.copy()
    for split in train_test_split(data, test_size=0.1, shuffle=True, random_state=1)
)

In [None]:
! pip install seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Number of training posts per class label.
x = df_train['target'].value_counts()
print(x)
# seaborn >= 0.12 only accepts `data` positionally, so x/y are passed as
# keywords; positional x/y raise TypeError there.
sns.barplot(x=x.index, y=x.values)

In [None]:
# Count of missing values in each training column.
df_train.isnull().sum()

In [None]:
# WORD-COUNT
# Whitespace-separated token count per post (non-string values are stringified first).
df_train['word_count'] = df_train['text'].map(lambda post: len(str(post).split()))
print(df_train.loc[df_train['target'] == 7, 'word_count'].mean())  # Cars
print(df_train.loc[df_train['target'] == 9, 'word_count'].mean())  # Baseball

In [None]:
# Discard rows that have a missing value in any column, in both splits.
df_train, df_test = df_train.dropna(), df_test.dropna()

In [None]:
# PLOTTING WORD-COUNT
# Side-by-side histograms of words per post for classes 7 and 9.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

train_words = df_train.loc[df_train['target'] == 7, 'word_count']
ax1.hist(train_words, color='red')
ax1.set_title('Cars')

train_words = df_train.loc[df_train['target'] == 9, 'word_count']
ax2.hist(train_words, color='green')
ax2.set_title('Baseball')

fig.suptitle('Words per post')
plt.show()

In [None]:
# CHARACTER-COUNT
# Length in characters of each post (non-string values are stringified first).
df_train['char_count'] = df_train['text'].map(lambda post: len(str(post)))
print(df_train.loc[df_train['target'] == 7, 'char_count'].mean())  # Cars
print(df_train.loc[df_train['target'] == 9, 'char_count'].mean())  # Baseball

In [None]:
#convert to lowercase, strip and remove punctuations
def preprocess(text):
    """Normalise a raw post into lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase and trim; drop ``<...>`` tag-like spans;
    blank out bracketed numeric references such as ``[12]``; replace ASCII
    punctuation with spaces; delete any remaining character that is neither
    a word character nor whitespace; replace digits with spaces; collapse
    whitespace runs and trim.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    text = text.lower().strip()
    text = re.sub(r'<.*?>', '', text)
    # Must run before punctuation stripping: '[' and ']' are in
    # string.punctuation, so afterwards this pattern could never match.
    text = re.sub(r'\[[0-9]*\]', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Catches symbols outside string.punctuation (e.g. typographic quotes).
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d', ' ', text)
    # Digits at either end leave a space behind, so trim after collapsing.
    return re.sub(r'\s+', ' ', text).strip()


# STOPWORD REMOVAL
def stopword(string, stop_words=None):
    """Remove stopwords from a whitespace-separated string.

    Args:
        string: Text to filter; it is split on whitespace.
        stop_words: Optional iterable of words to drop. When omitted, the
            NLTK English stopword list is used.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if stop_words is None:
        # Load the list once per call rather than once per token.
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests.
    stop_words = set(stop_words)
    return ' '.join(token for token in string.split() if token not in stop_words)
#LEMMATIZATION
# Initialize the lemmatizer
# A single module-level instance, reused by lemmatizer() for every token.
wl = WordNetLemmatizer()

# This is a helper function to map NLTK part-of-speech tags
def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into the matching WordNet POS constant.

    The tag is matched by its leading letter: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN
# Tokenize the sentence
def lemmatizer(string):
    """Lemmatize every token of `string` and return them space-joined.

    The text is tokenized and POS-tagged with NLTK; each token is then
    lemmatized with the module-level WordNet lemmatizer `wl`, using the
    WordNet POS that get_wordnet_pos derives from the token's tag.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))  # (token, tag) pairs
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

In [None]:
def finalpreprocess(string):
    """Run the full cleaning pipeline: preprocess -> stopword -> lemmatizer."""
    cleaned = preprocess(string)
    without_stopwords = stopword(cleaned)
    return lemmatizer(without_stopwords)
# Clean both splits with the same pipeline.
df_train['clean_text'] = df_train['text'].apply(finalpreprocess)
df_test['clean_text'] = df_test['text'].apply(finalpreprocess)
# The preview goes last: Jupyter only renders a cell's final expression, so
# a bare df_train.head() in the middle of the cell was silently discarded.
df_train.head()

In [None]:
#SPLITTING THE TRAINING DATASET INTO TRAIN AND VALIDATION
# 80/20 split of the training frame, stratified on the newsgroup title.
X_train, X_val, y_train, y_val = train_test_split(
    df_train["clean_text"],
    df_train["title"],
    test_size=0.2,
    shuffle=True,
    stratify=df_train["title"],
    random_state=2,
)
# The held-out test frame is used as-is.
X_test, y_test = df_test['clean_text'], df_test['title']
# Word2Vec runs on tokenized sentences
X_train_tok = list(map(nltk.word_tokenize, X_train))
X_val_tok = list(map(nltk.word_tokenize, X_val))
X_test_tok = list(map(nltk.word_tokenize, X_test))

In [None]:
#Tf-Idf vectorizer
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
# Vocabulary and IDF weights are learned from the training texts only ...
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train)
# ... and then applied unchanged to the validation and test texts.
X_val_vectors_tfidf, X_test_vectors_tfidf = (
    tfidf_vectorizer.transform(texts) for texts in (X_val, X_test)
)

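NOT USED ("NO SE USA"): the Doc2Vec code in this cell is not part of the pipeline; it needs the gensim imports that are commented out in the first cell.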


# NOTE(review): TaggedDocument and Doc2Vec come from the gensim imports that
# are commented out in the first cell, so this snippet raises NameError
# unless those imports are restored.
# One tagged document per cleaned training post, tagged with its position.
train_docs = [TaggedDocument(doc.split(' '), [i])
             for i, doc in enumerate(df_train.clean_text)]

#df_train['clean_text_tok']=[nltk.word_tokenize(i) for i in df_train['clean_text']]
#model = Doc2Vec(df_train['clean_text_tok'],min_count=1)
#model.train(df_train['clean_text_tok'],total_examples=model.corpus_count,epochs=model.epochs)
#vocabulary = model.wv.key_to_index
#print(vocabulary)
#X_train_vectors_w2v = model.infer_vector(X_train)


#build the model
# NOTE(review): the name `model` is reassigned to the Keras network in a later cell.
model = Doc2Vec(vector_size=64, window=2, min_count=1, workers=8, epochs = 40)
#build vocab
model.build_vocab(train_docs)
#train model
model.train(train_docs, total_examples=model.corpus_count, epochs=model.epochs)

#w2v = dict(zip(model.wv.index_to_key, model.wv.syn0))
#modelw = MeanEmbeddingVectorizer(w2v)
# converting text to numerical data using Word2Vec
#X_train_vectors_w2v = modelw.transform(X_train_tok)
#X_val_vectors_w2v = modelw.transform(X_test_tok)

NOT USED ("NO SE USA"): the Doc2Vec vector inference below is not part of the pipeline.

# Obtain vectors via doc2vec

#obtain vectors
# NOTE(review): X_test (the whole Series of test texts) is passed to a single
# infer_vector call here, whereas the training line below passes one token
# list per document -- confirm which was intended.
X_test_w2v = [model.infer_vector((X_test))]
X_test_w2v
# NOTE(review): df_train['clean_text'][i] indexes by label; df_train comes from
# a shuffled split, so labels 0..len-1 are presumably not all present -- verify
# (positional access via .iloc may be what was meant).
X_train_w2v = [model.infer_vector((df_train['clean_text'][i].split(' ')))
            for i in range(0,len(df_train['clean_text']))]





In [None]:
#FITTING THE CLASSIFICATION MODEL using Logistic Regression(tf-idf)
# Imports sit at the top of the cell; balanced_accuracy_score is also used by
# the classifier cells that follow.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, balanced_accuracy_score
from sklearn.multiclass import OneVsRestClassifier

# The target has more than two classes. scikit-learn >= 1.5 deprecates using
# solver='liblinear' directly for multiclass targets and recommends wrapping
# the estimator in OneVsRestClassifier to keep the one-versus-rest scheme.
lr_tfidf = OneVsRestClassifier(LogisticRegression(solver='liblinear', C=10, penalty='l2'))
lr_tfidf.fit(X_train_vectors_tfidf, y_train)
#Predict y value for test dataset
y_predict = lr_tfidf.predict(X_test_vectors_tfidf)
# Column 1 is the probability of the second class in lr_tfidf.classes_ only;
# y_prob is not referenced anywhere else in the visible notebook.
y_prob = lr_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:\n',confusion_matrix(y_test, y_predict))

lr_acc = accuracy_score(y_test, y_predict)
lr_f1 = f1_score(y_test, y_predict, average ='weighted')
lr_bacc = balanced_accuracy_score(y_test, y_predict)

print(f"Test Accuracy of Logistic Regression(tf-idf) is {lr_acc} \n")
print(f"Test Balanced Accuracy of Logistic Regression(tf-idf) is {lr_bacc} \n")
print(f"Test F1-score weighted of Logistic Regression(tf-idf) is {lr_f1} \n")

In [None]:
#FITTING THE CLASSIFICATION MODEL using Naive Bayes(tf-idf)
# fit() returns the estimator itself, so construction and training chain.
nb_tfidf = MultinomialNB().fit(X_train_vectors_tfidf, y_train)
#Predict y value for test dataset
y_predict = nb_tfidf.predict(X_test_vectors_tfidf)
# Probability column of the second class in nb_tfidf.classes_.
y_prob = nb_tfidf.predict_proba(X_test_vectors_tfidf)[:, 1]
print(classification_report(y_test, y_predict))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predict))

# balanced_accuracy_score is imported in the Logistic Regression cell.
nb_acc, nb_f1, nb_bacc = (
    accuracy_score(y_test, y_predict),
    f1_score(y_test, y_predict, average='weighted'),
    balanced_accuracy_score(y_test, y_predict),
)

print(f"Test Accuracy of Naive Bayes(tf-idf) is {nb_acc} \n")
print(f"Test Balanced Accuracy of Naive Bayes(tf-idf) is {nb_bacc} \n")
print(f"Test F1-score weighted of Naive Bayes(tf-idf) is {nb_f1} \n")

In [None]:
# Support Vector Machine with default settings on the TF-IDF features.
# fit() returns the estimator itself, so construction and training chain.
svc_tfidf = SVC().fit(X_train_vectors_tfidf, y_train)
#Predict y value for test dataset
y_predict = svc_tfidf.predict(X_test_vectors_tfidf)
#y_prob = svc_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test, y_predict))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predict))

# balanced_accuracy_score is imported in the Logistic Regression cell.
svm_acc, svm_f1, svm_bacc = (
    accuracy_score(y_test, y_predict),
    f1_score(y_test, y_predict, average='weighted'),
    balanced_accuracy_score(y_test, y_predict),
)

print(f"Test Accuracy of Support Vector Machine (tf-idf) is {svm_acc} \n")
print(f"Test Balanced Accuracy of Support Vector Machine (tf-idf) is {svm_bacc} \n")
print(f"Test F1-score weighted of Support Vector Machine (tf-idf) is {svm_f1} \n")

In [None]:
# Decision tree on the TF-IDF features.
# random_state is fixed so the tree, and therefore the reported metrics, are
# reproducible across runs, consistent with the seeded data splits.
dt_tfidf = DecisionTreeClassifier(random_state=1)
dt_tfidf.fit(X_train_vectors_tfidf, y_train)
#Predict y value for test dataset
y_predict = dt_tfidf.predict(X_test_vectors_tfidf)
#y_prob = dt_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:\n',confusion_matrix(y_test, y_predict))

# balanced_accuracy_score is imported in the Logistic Regression cell.
dt_acc = accuracy_score(y_test, y_predict)
dt_f1 = f1_score(y_test, y_predict, average ='weighted')
dt_bacc = balanced_accuracy_score(y_test, y_predict)

print(f"Test Accuracy of Decision Tree(tf-idf) is {dt_acc} \n")
print(f"Test Balanced Accuracy of Decision Tree(tf-idf) is {dt_bacc} \n")
print(f"Test F1-score weighted of Decision Tree(tf-idf) is {dt_f1} \n")

In [None]:
# scikit-learn multi-layer perceptron on the TF-IDF features.
# random_state is fixed so weight initialisation, and therefore the reported
# metrics, are reproducible across runs, consistent with the seeded splits.
nn_tfidf = MLPClassifier(random_state=1)
nn_tfidf.fit(X_train_vectors_tfidf, y_train)
#Predict y value for test dataset
y_predict = nn_tfidf.predict(X_test_vectors_tfidf)
#y_prob = nn_tfidf.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:\n',confusion_matrix(y_test, y_predict))

# balanced_accuracy_score is imported in the Logistic Regression cell.
nn_acc = accuracy_score(y_test, y_predict)
nn_f1 = f1_score(y_test, y_predict, average ='weighted')
nn_bacc = balanced_accuracy_score(y_test, y_predict)

print(f"Test Accuracy of Neural Network(tf-idf) is {nn_acc} \n")
print(f"Test Balanced Accuracy of Neural Network(tf-idf) is {nn_bacc} \n")
print(f"Test F1-score weighted of Neural Network(tf-idf) is {nn_f1} \n")

In [None]:
# Convert the TF-IDF matrices to dense arrays for the Keras model below.
# The hasattr guard makes the cell safe to re-run: after the first run these
# names hold NumPy arrays, which have no .toarray() method.
if hasattr(X_train_vectors_tfidf, "toarray"):
    X_train_vectors_tfidf = X_train_vectors_tfidf.toarray()
if hasattr(X_val_vectors_tfidf, "toarray"):
    X_val_vectors_tfidf = X_val_vectors_tfidf.toarray()
if hasattr(X_test_vectors_tfidf, "toarray"):
    X_test_vectors_tfidf = X_test_vectors_tfidf.toarray()

In [None]:
from keras import Sequential
from keras.layers import Dense, Dropout
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder

In [None]:
# one hot encode
# The encoder is fitted on the training labels only and reused for every split.
encoder = LabelEncoder()
encoder.fit(y_train)
# num_classes is passed explicitly: without it to_categorical infers the width
# from the largest label present, so a split missing the highest class would
# get fewer columns than the others.
# NOTE: this overwrites y_train / y_val / y_test, so the scikit-learn cells
# above cannot be re-run afterwards without re-creating the label Series.
y_train = tf.keras.utils.to_categorical(encoder.transform(y_train), num_classes=len(encoder.classes_))
y_val = tf.keras.utils.to_categorical(encoder.transform(y_val), num_classes=len(encoder.classes_))
y_test = tf.keras.utils.to_categorical(encoder.transform(y_test), num_classes=len(encoder.classes_))
y_train

In [None]:
# Small feed-forward classifier: one hidden ELU layer, dropout, and a 5-way softmax.
model = Sequential([
    Dense(16, activation='elu'),
    Dropout(0.5),
    Dense(5, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 epochs, reporting metrics on the validation split after each one.
history = model.fit(
    X_train_vectors_tfidf,
    y_train,
    epochs=10,
    validation_data=(X_val_vectors_tfidf, y_val),
)

In [None]:
#Predict y value for test dataset
y_predict = model.predict(X_test_vectors_tfidf)
# Reduce the one-hot targets and the per-class model outputs to class indices once.
true_idx = y_test.argmax(axis=1)
pred_idx = y_predict.argmax(axis=1)
print(classification_report(true_idx, pred_idx))
print('Confusion Matrix:\n', confusion_matrix(true_idx, pred_idx))

acc = accuracy_score(true_idx, pred_idx)
f1 = f1_score(true_idx, pred_idx, average='weighted')
#bacc = balanced_accuracy_score(y_test.argmax(axis=1), y_predict.argmax(axis=1))

print(f"Test Accuracy of Neural Network(tf-idf) is {acc} \n")
#print(f"Test Balanced Accuracy of Naive Bayes(tf-idf) is {bacc} \n")
print(f"Test F1-score weighted of Neural Network(tf-idf) is {f1} \n")