# Text classification Project

Library include and download

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import csv
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
import nltk

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hexiantao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/hexiantao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Text preprocessing

In [5]:
def get_data_from_file(path_file):
    with open(path_file,'r') as file_input:
        csv_reader=csv.reader(file_input, delimiter=',')
        next(csv_reader)
        x_list=[]
        y_list=[]
        for row in csv_reader:
            x_list.append(row[2])
            y_list.append(row[1])
    return x_list, y_list

In [6]:
def text_process(x_list):
    porter_stemmer=PorterStemmer()
    x_list_new=[]
    for x in x_list:
        # Lower canse#
        x=x.lower()
        # tokenize #
        x_token_list=word_tokenize(x)
        # remove stopwords #
        x_token_list=[token for token in x_token_list if token not in stopwords.words('english')]
        # stemming #
        x_token_list=[porter_stemmer.stem(token) for token in x_token_list]
        
        x_new=''.join(x_token_list)
        x_list_new.append(x_new)   
    return x_list_new

In [7]:
def preprocess():
    path_train=('desktop/train.csv')
    path_test=('desktop/test.csv')
    X_train,y_train=get_data_from_file(path_train)
    X_test,y_test=get_data_from_file(path_test)
    X_train=text_process(X_train)
    X_test=text_process(X_test)
    return X_train,y_train,X_test,y_test

In [8]:
## The main structure 
## Important
## X_train,y_train,X_test,y_test=preprocess()

# Feature extract

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import mutual_info_classif
from numpy import argmax, argsort, array

In [10]:
def get_feature_binary(sample_list):
    Vectorizer=CountVectorizer()
    # data matrix
    data=Vectorizer.fit_transform(sample_list).todense()
    data[data>0]=1
    return data

In [11]:
## the main structure
## important
## result=get_feature_binary(X_train)
## result_test=get_feature_binary(X_test)

# Classification by model

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [13]:
def classify_NaiveBayesian(x,y,X_test):
    # train
    nbayes=MultinomialNB()
    nbayes.fit(x,y)
    # test
    y_predict=nbayes.predict(X_test)
    return y_predict

In [15]:
def classify_Logistic(x,y,X_test):
    # train
    logistic=LogisticRegression()
    logistic.fit(x,y)
    # test
    y_predict=logistic.predict(X_test)
    return y_predict

In [16]:
def classify_SVM(x,y,X_test):
      # train
    SVM=SVC()
    SVM.fit(x,y)
    # test
    y_predict=SVM.predict(X_test)
    return y_predict

In [14]:
def evaluation(y_pred,y_test):
    print(classification_report(y_pred,y_test,target_names=['not racist/sexist','racist/sexist'],digits=3)) # how accuracy you want to know
    return 

# Naive bayes model evaluation

In [17]:
## main structure ##
X_train,y_train,X_test,y_test=preprocess()
result=get_feature_binary(X_train+X_test)
X_train=result[:len(X_train)]
X_test=result[len(X_train):]
y_predict=classify_NaiveBayesian(X_train,y_train,X_test)

In [18]:
evaluation(y_predict,y_test)

                   precision    recall  f1-score   support

not racist/sexist      0.880     0.963     0.920       435
    racist/sexist      0.333     0.123     0.180        65

        micro avg      0.854     0.854     0.854       500
        macro avg      0.607     0.543     0.550       500
     weighted avg      0.809     0.854     0.824       500



# Logistic Regression model evaluation

In [19]:
## main structure ##
X_train,y_train,X_test,y_test=preprocess()
result=get_feature_binary(X_train+X_test)
X_train=result[:len(X_train)]
X_test=result[len(X_train):]
y_predict=classify_Logistic(X_train,y_train,X_test)



In [20]:
evaluation(y_predict,y_test)

                   precision    recall  f1-score   support

not racist/sexist      1.000     0.954     0.976       499
    racist/sexist      0.042     1.000     0.080         1

        micro avg      0.954     0.954     0.954       500
        macro avg      0.521     0.977     0.528       500
     weighted avg      0.998     0.954     0.975       500



# SVM model evaluation

In [21]:
## main structure ##
X_train,y_train,X_test,y_test=preprocess()
result=get_feature_binary(X_train+X_test)
X_train=result[:len(X_train)]
X_test=result[len(X_train):]
y_predict=classify_SVM(X_train,y_train,X_test)



In [22]:
evaluation(y_predict,y_test)

                   precision    recall  f1-score   support

not racist/sexist      1.000     0.952     0.975       500
    racist/sexist      0.000     0.000     0.000         0

        micro avg      0.952     0.952     0.952       500
        macro avg      0.500     0.476     0.488       500
     weighted avg      1.000     0.952     0.975       500



  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
