In [8]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import string
import re

import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
nltk.download("punkt")
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score
from sklearn import metrics

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv("../data/raw/blogtext.csv")

In [3]:
pd.set_option("display.max_colwidth", 100)
df.head()

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [None]:
df.dropna()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
df.isna().any()

In [None]:
df.describe().T

In [None]:
df.info()

In [None]:
df.columns

In [None]:
# Creating a new dataframe with two columns
df = df[["topic", "text"]]

In [None]:
df["topic"].value_counts()

In [None]:
# Restrict the data to the topics being modelled. The explicit .copy() makes
# the result an independent frame rather than a slice of the original, so
# columns can be assigned to it without pandas' SettingWithCopyWarning.
options = ["Technology", "Arts", "Education", "Communications-Media", "Internet", "Non-Profit", "Engineering"]
df = df.loc[df["topic"].isin(options)].copy()
df

In [None]:
df["topic"].value_counts()

In [None]:
# Encode every topic name as an integer label.
topic_codes, _ = df["topic"].factorize()
df["topic_id"] = topic_codes

# One row per distinct (topic, topic_id) pair, ordered by id.
topic_id_df = (
    df[["topic", "topic_id"]]
    .drop_duplicates()
    .sort_values("topic_id")
)

# Lookup tables in both directions: topic name -> id and id -> topic name.
topic_to_id = dict(zip(topic_id_df["topic"], topic_id_df["topic_id"]))
id_to_topic = dict(zip(topic_id_df["topic_id"], topic_id_df["topic"]))

In [None]:
df.head()

In [None]:
df["topic_id"].value_counts()

In [None]:
# Visualize the data

# Bar chart of the number of texts per topic, drawn on an explicit Axes so the
# label is attached to this figure rather than to pyplot's implicit state.
fig, ax = plt.subplots(figsize=(8, 6))
df.groupby("topic").text.count().plot.bar(ax=ax, ylim=0)
ax.set_ylabel("Number of occurrences", fontsize=10)
plt.show()

In [None]:
df.head()

In [None]:
# Pre-processing the "text" column
string.punctuation

In [None]:
# removing punctuation

def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed."""
    kept_chars = filter(lambda ch: ch not in string.punctuation, text)
    return "".join(kept_chars)

df["text"] = df["text"].apply(lambda x: remove_punctuation(x))
df.head()

In [None]:
# removing unwanted characters
df["text"] = df["text"].apply(lambda x: re.sub('[^A-Za-z0-9]+', " ", x))
df.head()

In [None]:
# removing leading and trailing whitespace
df["text"] = df["text"].apply(lambda x: x.strip())
df.head()

In [None]:
# Collapse every run of two or more whitespace characters into a single space.
# regex=True must be passed explicitly: from pandas 2.0 onward Series.str.replace
# treats the pattern as a literal string by default, in which case r"\s\s+"
# would never match and this step would silently do nothing.
df["text"] = df["text"].str.replace(r"\s\s+", " ", regex=True)
df.head()

In [None]:
# tokenization
def tokenize(text):
    """Split ``text`` into tokens on runs of non-word characters.

    Args:
        text: The string to split.

    Returns:
        A list of the non-empty pieces of ``text``; an empty list when
        ``text`` contains no word characters.
    """
    # The pattern is a raw string because "\W" in a plain literal is an invalid
    # escape sequence, which newer Python versions warn about.
    # re.split yields "" for empty input and for separators at either end of
    # the text; those empty tokens carry no information, so they are dropped.
    return [token for token in re.split(r"\W+", text) if token]

df["text"] = df["text"].apply(lambda x: tokenize(x.lower()))
df.head()

In [None]:
# removing stopwords

# NLTK's English stop-word list extended with the extra token "urllink"
# (presumably a link placeholder in this corpus; confirm against the raw data).
# NOTE(review): this rebinds the name `stopwords`, which was imported earlier
# as the nltk.corpus module; the fully qualified path keeps re-runs working.
stopwords = [*nltk.corpus.stopwords.words("english"), "urllink"]
stopwords[:10]

In [None]:
def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Args:
        text: Iterable of tokens to filter.
        stop_words: Optional iterable of words to drop. When omitted, the
            notebook-level ``stopwords`` collection is used.

    Returns:
        A list of the remaining tokens in their original order.
    """
    # Build a set once per call so each membership test is a hash lookup
    # instead of a scan over the whole stop-word list for every token.
    excluded = frozenset(stopwords if stop_words is None else stop_words)
    return [word for word in text if word not in excluded]

df["text"] = df["text"].apply(lambda x: remove_stopwords(x))
df.head()

In [None]:
# lemmatization

wn = nltk.WordNetLemmatizer()

def lemmatization(text):
    """Lemmatize each token of ``text`` with ``wn`` and join the results with single spaces."""
    lemmas = map(wn.lemmatize, text)
    return " ".join(lemmas)

df["text"] = df["text"].apply(lambda x: lemmatization(x))
df.head()

In [None]:
# Further Text processing using TF-IDF

# Unigram + bigram TF-IDF with sublinear term-frequency scaling, limited to the
# 5000 highest-ranked terms that occur in at least 5 documents.
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, max_features=5000, ngram_range=(1,2), stop_words="english")

# Transform each text into a TF-IDF vector. The result is kept as the sparse
# matrix that fit_transform returns: densifying it with .toarray() would
# allocate n_documents x 5000 floats, while chi2 and the scikit-learn
# estimators used for cross-validation accept sparse input directly.
features = tfidf.fit_transform(df.text)
labels = df.topic_id
print(features.shape)

In [None]:
tfidf.vocabulary_

In [None]:
# Finding the N terms most correlated with each of the topic categories
N = 2
# The fitted vocabulary does not change between topics, so fetch it once.
vocabulary = np.array(tfidf.get_feature_names_out())
for topic, topic_id in sorted(topic_to_id.items()):
    # One-vs-rest chi-squared test: this topic against all the others.
    features_chi2 = chi2(features, labels == topic_id)
    # Ascending sort of the chi2 statistics, so the strongest terms come last.
    indices = np.argsort(features_chi2[0])
    feature_names = vocabulary[indices]
    unigrams = [v for v in feature_names if len(v.split(" ")) == 1]
    bigrams = [v for v in feature_names if len(v.split(" ")) == 2]
    # "\n" starts each topic's report on a fresh line.
    print("\n-----> %s:" %(topic))
    print("  * Most Correlated Unigrams are: %s" %(", ".join(unigrams[-N:])))
    print("  * Most Correlated Bigrams are: %s" %(", ".join(bigrams[-N:])))

In [None]:
# Inputs are the blog texts; targets are the integer topic ids to predict.
X = df["text"]
y = df["topic_id"]

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Model selection

# Candidate classifiers to compare.
models = [
    RandomForestClassifier(n_estimators=200, random_state=42),
    # Seeded like the other estimators so repeated runs are comparable.
    LinearSVC(random_state=42),
    MultinomialNB(),
    LogisticRegression(random_state=42),
]

# Cross-validation: score every model on CV folds and collect one
# (model_name, fold_idx, accuracy) record per fold.
CV = 5
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring="accuracy", cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))

# Build the results frame once from the collected records (no placeholder
# frame is needed beforehand; it would only be overwritten here).
cv_df = pd.DataFrame(entries, columns=["model_name", "fold_idx", "accuracy"])