In [11]:
# Import Libraries

import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def data_cleaning(input_file):

    # Read the CSV file into a Pandas DataFrame
    df = pd.read_csv(input_file, encoding='utf-8')

    # Remove rows with all empty values
    df.dropna(how='all', inplace=True)

    # Drop duplicate rows
    df.drop_duplicates(inplace=True)

    # Remove rows with empty values in a specific column
    df.dropna(subset=['Tag'], inplace=True)

    # Remove leading and trailing spaces from each column
    df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

    return df


# Dictionary for storing all unique words in df["Result"]
unique_words = {}

def data_preprocessing(df):

    # Converting everthing into string datatype.
    df['Above Text'] = df['Above Text'].astype(str)
    df['Below Text'] = df['Below Text'].astype(str)
    df['Row Text'] = df['Row Text'].astype(str)
    df['Col Text'] = df['Col Text'].astype(str)

    # Concatenate the values in the "Above Text", "Below Text", "Row Text", "Col Text" columns with a space.
    df['Result'] = df[['Above Text', 'Row Text', 'Col Text', 'Below Text']].apply(lambda x: ' '.join(x), axis=1)

    # define a lambda function to convert camelcase string to normal string
    to_normal_string = lambda s: ''.join([' ' + i.lower() if i.isupper() else i for i in s]).lstrip(' ')

    # apply the lambda function to the column
    df['Tag'] = df['Tag'].apply(to_normal_string)

    # Convert the text to lowercase
    df['Result'] = df['Result'].str.lower()
    df['Tag'] = df['Tag'].str.lower()

    # Remove punctuation
    df['Result'] = df['Result'].str.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the strings into individual words
    df['Result'] = df['Result'].apply(word_tokenize)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    stop_words.add('nan')
    df['Result'] = df['Result'].apply(lambda x: [word for word in x if word not in stop_words])

    # Storing all unique words in dictionary (unique_words)
    if (len(unique_words) == 0):
        k = 0
        for i in df['Result']:
                for j in i:
                    if j not in unique_words:
                        unique_words[j] = k
                        k += 1

    # Join the stemmed words back into a string
    df['Result'] = df['Result'].apply(lambda x: ' '.join(x))

    return df


def vectorizer(df):

    # TF-IDF vectorization
    v = TfidfVectorizer(vocabulary = unique_words)
    x = v.fit_transform(df['Result'])
    y = df['Tag']
    x = x.toarray()

    return x, y


def naive_bayes(x_train, y_train, x_test, y_test):

    # Train a Naive Bayes model on the preprocessed training data
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)

    # Test
    y_pred = gnb.predict(x_test)
    
    # Evaluate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Naive Bayes Accuracy: {accuracy:.2f}")


def decision_trees(x_train, y_train, x_test, y_test):

    # Train a Naive Bayes model on the preprocessed training data
    dt = DecisionTreeClassifier()
    dt.fit(x_train, y_train)

    # Test
    y_pred = dt.predict(x_test)
    
    # Evaluate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Decision Trees Accuracy: {accuracy:.2f}")


def svm(x_train, y_train, x_test, y_test):

    # Train a Naive Bayes model on the preprocessed training data
    svm = SVC(kernel='linear', C=1, random_state=1)
    svm.fit(x_train, y_train)

    # Test
    y_pred = svm.predict(x_test)
    
    # Evaluate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"SVM Accuracy: {accuracy:.2f}")


def knn(x_train, y_train, x_test, y_test):

    # Train a Naive Bayes model on the preprocessed training data
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(x_train, y_train)

    # Test
    y_pred = knn.predict(x_test)
    
    # Evaluate the accuracy of the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"KNN Accuracy: {accuracy:.2f}")


In [13]:
# Training data file
train_file = "train.csv"

# Training data cleaning and preprocessing
cleaned_tr_df = data_cleaning(train_file)
preprocessed_tr_df = data_preprocessing(cleaned_tr_df)

# Vectorization of training data
x_train, y_train = vectorizer(preprocessed_tr_df)

# Testing data file
test_file = "test.csv"

# Testing data cleaning and preprocessing
cleaned_ts_df = data_cleaning(test_file)
preprocessed_ts_df = data_preprocessing(cleaned_ts_df)

# Vectorization of training data
x_test, y_test = vectorizer(preprocessed_ts_df)


# Training and Testing on Naive Bayes
naive_bayes(x_train, y_train, x_test, y_test)

# Training and Testing on Decision Trees
decision_trees(x_train, y_train, x_test, y_test)

# Training and Testing on SVM
svm(x_train, y_train, x_test, y_test)

# Training and Testing on KNN
knn(x_train, y_train, x_test, y_test)

Naive Bayes Accuracy: 0.57
Decision Trees Accuracy: 0.77
SVM Accuracy: 0.81
KNN Accuracy: 1.00
