# Text classification 


## Load libraries 


Import only relevant libraries.

In [None]:
# Numpy and Pandas 

import numpy as np
import warnings
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler

# Data visualization 

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# REGEX and NLP

import re
import string
!pip install nltk
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# ML

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB # Naive-Bayes
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression # Linear models
from xgboost import XGBClassifier # XG Boost
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score # Accuracy score
from sklearn.metrics import balanced_accuracy_score # Balanced accuracy score
from sklearn.metrics import cohen_kappa_score # Cohen's Kappa score
from sklearn.utils import resample # for resampling

# Interface

import tkinter as tk
from tkinter import filedialog

warnings.filterwarnings('ignore')



## Load data


In [None]:
# Open file path

# root = tk.Tk()
# root.withdraw()

# file_path = filedialog.askopenfilename()

In [None]:

# The labeled data (one CSV per newspaper group)
# NOTE: these are absolute paths under /home/jae; adjust them before running
# the notebook on another machine.

asian_sample = pd.read_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/sample_asian.csv")
black_sample = pd.read_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/sample_black.csv")

# The unlabeled data (same two groups; labels are to be predicted later)

asian_unlabeled = pd.read_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/unlabeled_asian.csv")
black_unlabeled = pd.read_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/unlabeled_black.csv")

Examine files.

In [None]:
# First five rows

asian_sample.head(5)

Select only relevant columns.

In [None]:
# Remove the 'Unnamed: 0' column from every data frame
# (presumably the row index that was written out with the CSV -- confirm)

## Labeled (seen) data

asian_sample = asian_sample.drop(columns = ['Unnamed: 0'])
black_sample = black_sample.drop(columns = ['Unnamed: 0'])

# Equivalent alternative: keep only the wanted columns,
# e.g. asian_sample = asian_sample[['col1', 'col2']]

## Unlabeled (unseen) data

asian_unlabeled = asian_unlabeled.drop(columns = ['Unnamed: 0'])
black_unlabeled = black_unlabeled.drop(columns = ['Unnamed: 0'])

Convert the date column to datetime. This data type lets us extract information from the column; for instance, `asian_sample['date'].dt.year` returns the years.

In [None]:

# Parse the "date" column as datetime in all four data frames:
# the seen (labeled) samples first, then the unseen (unlabeled) data.
for frame in (asian_sample, black_sample, asian_unlabeled, black_unlabeled):
    frame["date"] = pd.to_datetime(frame["date"])


Check the balance of target values: **imbalanced**. I used a resampling method (upsampling/oversampling) to address this problem.

In [None]:
# Check the balance of target values 

asian_sample['linked_progress'].value_counts()

In [None]:
asian_sample['linked_hurt'].value_counts()

In [None]:
black_sample['linked_progress'].value_counts()

In [None]:
black_sample['linked_hurt'].value_counts()

Note that the number of labeled Asian American articles is smaller because I removed 18 duplicates from the original sample.

## Preprocessing

### Remove special characters, punctuations, whitespace, and stopwords

- I created a function for cleaning texts.
- Removing stop words did not increase performance in this case. (So, I commented it out.)

In [None]:

# stop_words = stopwords.words('english')

def clean_text(document):
    """Normalize a pandas Series of raw texts for bag-of-words modeling.

    The steps are applied in this order: lowercase, turn line breaks into a
    single space, drop punctuation, drop digits, and strip leading/trailing
    whitespace.

    Parameters
    ----------
    document : pandas.Series
        Series of strings (one document per row).

    Returns
    -------
    pandas.Series
        The cleaned texts, aligned with the input index.
    """
    document = document.str.lower() # lower case
    # Replace line breaks with a space rather than an empty string; otherwise
    # the last word of one line is glued to the first word of the next line.
    document = document.str.replace(r'[\r\n]+', ' ', regex = True)
    document = document.str.replace(r'[^\w\s]', '', regex = True) # punctuation
    document = document.str.replace(r'\d+', '', regex = True) # digits
    document = document.str.strip() # remove surrounding whitespace
    # Stop-word removal did not improve performance here, so it stays disabled:
    # document = document.apply(lambda x: " ".join([y for y in x.split() if y not in stop_words]))
    return document

Let's see how it works using one sample.

In [None]:
clean_text(black_sample['text']).head() # first 5 rows 

Apply the function to each corpus.

In [None]:
# Clean the "text" column of every corpus:
# the seen (labeled) samples first, then the unseen (unlabeled) data.
for frame in (asian_sample, black_sample, asian_unlabeled, black_unlabeled):
    frame['text'] = clean_text(frame['text'])

## Feature engineering

Here, we turn texts into a document-term matrix. These terms represent features in the model and we aim to find a combination of features that are most effective in predicting target values.

### Vectorizer 

In [None]:

# Bag of Words (BOW)
# One module-level vectorizer, used by dtm_train, dtm_train_resample and
# test_text below.

vectorizer = CountVectorizer(
    max_features = 5000, # keep at most the 5,000 most frequent terms
    min_df = 1, # keep terms that occur in at least 1 document
    ngram_range = (1,2), # unigrams and bigrams
    binary = True, # record presence/absence instead of counts
)


Lots of things happened here. 

- Resampling to correct the imbalanced classes: `upsampled` the minority class 
- Converting text into a `document-term matrix` 
- Splitting the matrix into the training and testing set using `stratified random sampling`

I created two functions to examine how resampling affects model performance. The `dtm_train` function does not resample the data, whereas the `dtm_train_resample` function does. I ran both functions on the same data and compared how the two performed.


### Version 1: Creating DTM and Splitting data

In [None]:

def dtm_train(data, text, column, year):
    """Split the labeled data and build train/test document-term matrices.

    No resampling is applied. The rows are split first (80/20, stratified by
    ``year``); the module-level ``vectorizer`` is then fitted on the training
    texts only and merely applied to the test texts, so no vocabulary
    information leaks from the test set into training.

    Parameters
    ----------
    data : pandas.DataFrame
        Labeled data.
    text : str
        Name of the column holding the cleaned texts.
    column : str
        Name of the column holding the target labels.
    year : str
        Name of the column used to stratify the split.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``; the feature matrices are dense
        NumPy arrays and the labels are integer-encoded.

    Notes
    -----
    This re-fits the shared module-level ``vectorizer`` as a side effect.
    """

    ############################### RESPONSE VARIABLE ################################

    # Label encode (normalize) the response once, on all labels, so training
    # and test labels are guaranteed to share the same encoding.

    encoder = preprocessing.LabelEncoder()

    response = encoder.fit_transform(data[column].values)

    ############################### STRATIFIED RANDOM SAMPLING ################################

    # Split the raw texts (not the features) into training and testing sets

    text_train, text_test, y_train, y_test = train_test_split(data[text], response,
                                                              test_size = 0.2, # training = 80%, test = 20%
                                                              random_state = 1234, # for reproducibility
                                                              stratify = data[year]) # stratifying by year

    ############################### DOCUMENT-TERM MATRIX ################################

    # BOW model: learn the vocabulary from the training texts only.
    # toarray() gives a dense ndarray (GaussianNB needs dense input, and
    # np.matrix from todense() is rejected by recent scikit-learn versions).

    X_train = vectorizer.fit_transform(text_train).toarray()
    X_test = vectorizer.transform(text_test).toarray()

    return(X_train, y_train, X_test, y_test)


### Version 2: Creating DTM and Splitting data + Resampling

In [None]:


def dtm_train_resample(data, text, column, year, n_samples = None):
    """Split the labeled data, upsample the training minority class, build DTMs.

    The rows are split first (80/20, stratified by ``year``). Only the
    training portion is then upsampled, so no duplicated minority row can
    appear in both the training and the test set (which would inflate the
    test scores). The module-level ``vectorizer`` is fitted on the resampled
    training texts and merely applied to the test texts.

    Parameters
    ----------
    data : pandas.DataFrame
        Labeled data. Rows whose label is neither 0 nor 1 are ignored.
    text : str
        Name of the column holding the cleaned texts.
    column : str
        Name of the column holding the binary target (0 = majority class,
        1 = minority class).
    year : str
        Name of the column used to stratify the split.
    n_samples : int, optional
        Number of minority rows to draw (with replacement) for the training
        set. Defaults to the number of majority rows in the training set, so
        the two classes are balanced.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``; the feature matrices are dense
        NumPy arrays and the labels are integer-encoded.

    Raises
    ------
    ValueError
        If the training split contains no minority-class rows to resample.

    Notes
    -----
    This re-fits the shared module-level ``vectorizer`` as a side effect.
    Resampling code adapted from https://elitedatascience.com/imbalanced-classes
    """

    # Keep only the rows that belong to one of the two classes

    data = data[data[column].isin([0, 1])]

    ############################### STRATIFIED RANDOM SAMPLING ################################

    # Split BEFORE resampling so the test set only contains original rows

    train, test = train_test_split(data,
                                   test_size = 0.2, # training = 80%, test = 20%
                                   random_state = 1234, # for reproducibility
                                   stratify = data[year]) # stratifying by year

    ############################### RESAMPLING (TRAINING SET ONLY) ################################

    train_majority = train[train[column] == 0]
    train_minority = train[train[column] == 1]

    if len(train_minority) == 0:
        raise ValueError("No minority-class rows in the training split for column '%s'." % column)

    if n_samples is None:
        n_samples = len(train_majority) # match the majority class

    # Upsample (oversample) the minority class

    train_minority_upsampled = resample(train_minority,
                                        replace = True,        # sample with replacement
                                        n_samples = n_samples, # to match majority class
                                        random_state = 1234)   # reproducible results

    # Combine the majority class with the upsampled minority class and shuffle,
    # so the training rows are not ordered by class

    train = pd.concat([train_majority, train_minority_upsampled])
    train = train.sample(frac = 1, random_state = 1234)

    ############################### DOCUMENT-TERM MATRIX ################################

    # BOW model: learn the vocabulary from the training texts only.
    # toarray() gives a dense ndarray (GaussianNB needs dense input, and
    # np.matrix from todense() is rejected by recent scikit-learn versions).

    X_train = vectorizer.fit_transform(train[text]).toarray()
    X_test = vectorizer.transform(test[text]).toarray()

    ############################### RESPONSE VARIABLE ################################

    # Label encode (normalize) with one encoder fitted on all labels, so
    # training and test labels share the same encoding

    encoder = preprocessing.LabelEncoder()
    encoder.fit(data[column].values)

    y_train = encoder.transform(train[column].values)
    y_test = encoder.transform(test[column].values)

    return(X_train, y_train, X_test, y_test)



### Training and testing data and response variables

I created training and testing data (text features) and their response variables using the two custom functions shown above.


In [None]:
# Each call returns a tuple (X_train, y_train, X_test, y_test) and re-fits the
# shared `vectorizer`.
# Naming: lp = linked_progress target, lh = linked_hurt target.

# Asian American newspapers 

## Without resampling 

asian_lp_dtm = dtm_train(asian_sample, 'text', 'linked_progress', 'year')
asian_lh_dtm = dtm_train(asian_sample, 'text', 'linked_hurt', 'year')

## Resampled 

asian_lp_dtm_resample = dtm_train_resample(asian_sample, 'text', 'linked_progress', 'year')
asian_lh_dtm_resample = dtm_train_resample(asian_sample, 'text', 'linked_hurt', 'year')

# African American newspapers

## Without resampling 

black_lp_dtm = dtm_train(black_sample, 'text', 'linked_progress', 'year')
black_lh_dtm = dtm_train(black_sample, 'text', 'linked_hurt', 'year')

## Resampled 

black_lp_dtm_resample = dtm_train_resample(black_sample, 'text', 'linked_progress', 'year')
black_lh_dtm_resample = dtm_train_resample(black_sample, 'text', 'linked_hurt', 'year')


## Fit and evaluate a ML model

### Functions for various ML models

In [None]:
# Lasso

def fit_logistic_regression(X_train, y_train):
    """Fit an L1-penalized (lasso) logistic regression and return it.

    X_train holds the training features and y_train the training labels;
    the fitted LogisticRegression estimator is returned.
    """
    lasso = LogisticRegression(penalty = 'l1', solver = 'saga', fit_intercept = True)
    return lasso.fit(X_train, y_train) # fit() returns the estimator itself

# Naive-Bayes 

def fit_bayes(X_train, y_train):
    """Fit a Gaussian naive Bayes classifier with default settings and return it.

    X_train holds the training features and y_train the training labels.
    """
    return GaussianNB().fit(X_train, y_train) # fit() returns the estimator itself

# XG Boost 

def fit_xgboost(X_train, y_train):
    """Fit an XGBoost classifier with column/row subsampling and return it.

    X_train holds the training features and y_train the training labels.
    """
    # NOTE(review): both `random_state` and `seed` are passed with different
    # values -- confirm which one the installed xgboost version honours.
    settings = {
        'random_state': 42,
        'seed': 2,
        'colsample_bytree': 0.6,
        'subsample': 0.7,
    }
    booster = XGBClassifier(**settings)
    booster.fit(X_train, y_train)
    return booster


### Function for evaluating ML models (accuracy, balanced accuracy, and Cohen's kappa)

In [None]:

def test_model(model, X_train, y_train, X_test, y_test):
    """Score a fitted model on the test set.

    Returns a tuple (accuracy, balanced accuracy, Cohen's kappa) computed from
    the model's predictions for X_test against y_test. X_train and y_train
    are accepted but not used.
    """
    predictions = model.predict(X_test)
    scorers = (accuracy_score, balanced_accuracy_score, cohen_kappa_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)


### Model fitting 

Function for fitting selected models to the data

In [None]:
def fit_models(data):
    """Fit the three classifiers on one training set.

    Parameters
    ----------
    data : tuple
        Output of dtm_train / dtm_train_resample; data[0] is the training
        feature matrix and data[1] the training labels.

    Returns
    -------
    tuple
        The fitted models as (lasso, bayes, xgboost).
    """
    X_train, y_train = data[0], data[1]
    # Lasso
    lasso = fit_logistic_regression(X_train, y_train)
    # Naive-Bayes
    bayes = fit_bayes(X_train, y_train)
    # XG Boost (previously fitted with fit_bayes by mistake, so the third
    # model was a second naive Bayes classifier)
    xgboost = fit_xgboost(X_train, y_train)

    return(lasso, bayes, xgboost)

In [None]:
##### Without resampling 

In [None]:

# Fit the models on the data that was NOT resampled.
# Each result is the tuple (lasso, bayes, xgboost) returned by fit_models.

# Asian American newspapers

asian_lp_fit = fit_models(asian_lp_dtm)
asian_lh_fit = fit_models(asian_lh_dtm)

# African American newspapers

black_lp_fit = fit_models(black_lp_dtm)
black_lh_fit = fit_models(black_lh_dtm)


In [None]:
#### Resampled 

In [None]:

# Fit the models on the resampled data.
# Each result is the tuple (lasso, bayes, xgboost) returned by fit_models.

# Asian American newspapers

asian_lp_fit_resample = fit_models(asian_lp_dtm_resample)
asian_lh_fit_resample = fit_models(asian_lh_dtm_resample)

# African American newspapers

black_lp_fit_resample = fit_models(black_lp_dtm_resample)
black_lh_fit_resample = fit_models(black_lh_dtm_resample)


### Model evaluations 

Function for testing multiple models.

In [None]:

def test_models(models, data):
    """Evaluate the three fitted models on the same data split.

    `models` is the (lasso, bayes, xgboost) tuple and `data` the
    (X_train, y_train, X_test, y_test) tuple. Returns a tuple with one
    test_model result per model, in the same order.
    """
    X_train, y_train, X_test, y_test = data[0], data[1], data[2], data[3]
    return tuple(
        test_model(models[position], X_train, y_train, X_test, y_test)
        for position in range(3)
    )


Evaluate multiple models for each dataset.

In [None]:
# Each result is a tuple of three (accuracy, balanced accuracy, kappa) tuples,
# ordered lasso, bayes, xgboost. The `_sample` suffix marks the resampled runs.

# Asian American newspapers

asian_lp_models = test_models(asian_lp_fit, asian_lp_dtm)
asian_lp_models_sample = test_models(asian_lp_fit_resample, asian_lp_dtm_resample)

asian_lh_models = test_models(asian_lh_fit, asian_lh_dtm)
asian_lh_models_sample = test_models(asian_lh_fit_resample, asian_lh_dtm_resample)

# African American newspapers

black_lp_models = test_models(black_lp_fit, black_lp_dtm)
black_lp_models_sample = test_models(black_lp_fit_resample, black_lp_dtm_resample)

black_lh_models = test_models(black_lh_fit, black_lh_dtm)
black_lh_models_sample = test_models(black_lh_fit_resample, black_lh_dtm_resample)

## Prediction

### Function for predicting the unlabeled data

In [None]:

def test_text(text, model, fitted_vectorizer = None):
    """Predict labels for unlabeled texts with a fitted model.

    The texts are converted to features with an ALREADY FITTED vectorizer
    (``transform``, not ``fit_transform``). Re-fitting on the unlabeled texts
    would build a different vocabulary, so the feature columns would no
    longer correspond to the ones the model was trained on.

    Parameters
    ----------
    text : iterable of str
        Cleaned, unlabeled documents (e.g. a pandas Series).
    model : fitted classifier
        Any object with a ``predict`` method trained on features produced by
        the same vectorizer.
    fitted_vectorizer : optional
        The vectorizer that produced the model's training features. Defaults
        to the module-level ``vectorizer``. That shared object is re-fitted
        by every dtm_train / dtm_train_resample call, so by default it holds
        the vocabulary of the most recent call only; pass the matching
        vectorizer explicitly when that is not the one the model was
        trained with.

    Returns
    -------
    The array of predictions returned by ``model.predict``.
    """

    if fitted_vectorizer is None:
        fitted_vectorizer = vectorizer

    # BOW model: reuse the fitted vocabulary; dense ndarray for the classifiers

    features = fitted_vectorizer.transform(text).toarray()

    # Prediction

    preds = model.predict(features)

    return preds

### Label the unlabeled data

In [None]:
# Asian Americans 

# asian_lp_full = test_text(asian_unlabeled['text'], asian_lp)
# asian_lh_full = test_text(asian_unlabeled['text'], asian_lh)

# African Americans 

# black_lp_full = test_text(black_unlabeled['text'], black_lp)
# black_lh_full = test_text(black_unlabeled['text'], black_lh)

## Export classification results as CSV files 

I saved the classification results as CSV files to plot them in R. 

In [None]:

# Rename new columns  

# asian_lp_data = pd.DataFrame(asian_lp_full).rename(columns = {0:'labeled_linked_progress'})
# asian_lh_data = pd.DataFrame(asian_lh_full).rename(columns = {0:'labeled_linked_hurt'})
# black_lp_data = pd.DataFrame(black_lp_full).rename(columns = {0:'labeled_linked_progress'})
# black_lh_data = pd.DataFrame(black_lh_full).rename(columns = {0:'labeled_linked_hurt'})

# Save data 

# asian_lp_data.to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/asian_lp_data.csv")
# asian_lh_data.to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/asian_lh_data.csv")
# black_lp_data.to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/black_lp_data.csv")
# black_lh_data.to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/black_lh_data.csv")

This is what the final data looks like.

In [None]:
# asian_lp_data.head()