# Text classification 


## Load libraries 


Import only relevant libraries.

In [1]:
# Numpy and Pandas 

import numpy as np
import warnings
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler

# Data visualization 

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# REGEX and NLP

import re
import string
!pip install nltk
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# ML

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB # Naive-Bayes
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression # Linear models
from xgboost import XGBClassifier # Xgboost

################### Validation ######################
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold

################### Vectorizer ######################
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA

################### Model evals #####################
from sklearn.metrics import accuracy_score # Accuracy score
from sklearn.metrics import balanced_accuracy_score # Balanced accuracy score
from sklearn.metrics import cohen_kappa_score # Cohen's Kappa score

################### Imbalanced data #####################
from sklearn.utils import resample # for resampling

# Interface

import tkinter as tk
from tkinter import filedialog

warnings.filterwarnings('ignore')

  import pandas.util.testing as tm




[nltk_data] Downloading package stopwords to /home/jae/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data


In [2]:
# Open file path

# root = tk.Tk()
# root.withdraw()

# file_path = filedialog.askopenfilename()

In [3]:

# The labeled data 

asian_sample = pd.read_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/sample_asian.csv")
black_sample = pd.read_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/sample_black.csv")

# The unlabeled data

asian_unlabeled = pd.read_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/unlabeled_asian.csv")
black_unlabeled = pd.read_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/unlabeled_black.csv")

Examine files. As they all have similar structures, taking a look at one of these files is enough.

In [4]:
# First five rows

asian_sample.head(5)

Unnamed: 0.1,Unnamed: 0,author,date,source,text,year,linked_progress,linked_hurt,linked_progress_gran,linked_hurt_gran,Topics_C
0,1,"Lopez, Flora",1976-04-30,International Examiner,\n\n\n\n\n\n\n\nS.P.I.C.E. is a nutritional pr...,1976,1,0,1,0,Mismatch
1,2,,1976-09-30,International Examiner,\n\n\n\n\n\n\n\nCommunity control rather than ...,1976,0,0,0,0,Mismatch
2,3,,1976-06-30,International Examiner,"\n\n\n\n\n\n\n\n""Peasants of the Second Fortre...",1976,0,0,0,0,Arts
3,4,"Chin, Doug",1976-10-31,International Examiner,\n\n\n\n\n\n\n\nMuch of what is now the Intern...,1976,0,0,0,0,Housing
4,5,"Chow, Ron",1976-02-29,International Examiner,\n\n\n\n\n\n\n\nInternational District Housing...,1976,1,0,1,0,Housing


Select only relevant columns.

In [5]:
# Drop the first column

## Seen 

asian_sample = asian_sample.drop(['Unnamed: 0'], axis = 1)
black_sample = black_sample.drop(['Unnamed: 0'], axis = 1)

# An alternative way of doing this is asian_sample = asian_sample[['col1', 'col2']] 

## Unseen 

asian_unlabeled = asian_unlabeled.drop(['Unnamed: 0'], axis = 1)
black_unlabeled = black_unlabeled.drop(['Unnamed: 0'], axis = 1)

Convert date column into datetime. This new data type allows us to extract some useful information from the column. For instance, `asian_samplep['date'].year` returns years. 

In [6]:

# Seen data 
asian_sample["date"] = pd.to_datetime(asian_sample["date"])
black_sample["date"] = pd.to_datetime(black_sample["date"])

# Unseen data 
asian_unlabeled["date"] = pd.to_datetime(asian_unlabeled["date"])
black_unlabeled["date"] = pd.to_datetime(black_unlabeled["date"])


Check the balance of target values: **imbalanced**. I will use a resampling method (upsampling/oversampling) to address this problem.

In [7]:
# Check the balance of target values 

asian_sample['linked_progress'].value_counts()

0    920
1     66
Name: linked_progress, dtype: int64

In [8]:
asian_sample['linked_hurt'].value_counts()

0    936
1     50
Name: linked_hurt, dtype: int64

In [9]:
black_sample['linked_progress'].value_counts()

0    888
1    120
Name: linked_progress, dtype: int64

In [10]:
black_sample['linked_hurt'].value_counts()

0    878
1    130
Name: linked_hurt, dtype: int64

Note that the number of labeled Asian American articles was reduced because I removed 22 duplicate observations from the original sample.

## Preprocessing

### Remove special characters, punctuations, whitespace, and stopwords

- I created a function for cleaning texts.
- Removing stop words did not increase performance in this case. (So, I commented it out.)

In [11]:

# stop_words = stopwords.words('english')

def clean_text(document):
    document = document.str.lower() # lower case
    document = document.str.replace('[\r?\n]','', regex = True)
    document = document.str.replace('[^\\w\\s]','', regex = True)
    document = document.str.replace('\\d+', '', regex = True)   
    document = document.str.strip() # remove whitespace
  #  document = document.apply(lambda x: " ".join([y for y in x.split() if y not in stop_words]))
    return(document)

Let's see how the function works using one sample.

In [12]:
clean_text(black_sample['text']).head() # first 5 rows 

0    friday nov  at  pm rev l s rubin pastor at oli...
1    we have a large building an ante bellum buildi...
2    ktvus televoters were back to being pretty upt...
3    washington dc  washingtons appointed mayor wal...
4    spokesmen for the congress of racial equality ...
Name: text, dtype: object

Apply the function to each corpus.

In [13]:
# Seen

asian_sample['text'] = clean_text(asian_sample['text'])
black_sample['text'] = clean_text(black_sample['text'])

# Unseen

asian_unlabeled['text'] = clean_text(asian_unlabeled['text'])
black_unlabeled['text'] = clean_text(black_unlabeled['text'])

## Feature engineering

Here, we turn texts into a document-term matrix. These terms represent features in the model and we aim to find a combination of features that are most effective in predicting target values.

### Vectorizer 

In [14]:

# Bag of Words (BOW)

vectorizer = CountVectorizer(
    max_features = 5000, # 5,000 is large enough
    min_df = 1, # minimum frequency 1 
    ngram_range = (1,2), # ngram 
    binary = True,
)


Lots of things happened here. 

- Resampling to correct the imbalanced classes: `upsampled` the minority class 
- Converting text into a `document-term matrix` 
- Splitting the matrix into the training and testing set using `stratified random sampling`

I created two functions to examine the extent to which resampling improves model performances. The `dtm_train function` does not resample the data and the `dtm_train_resample function` does. I ran both functions using the same data and compared their relative performances.


### Version 1: Creating DTM and Splitting data

In [15]:

def dtm_train(data, text, column, year):
    
    ############################### DOCUMENT-TERM MATRIX ################################
    
    # BOW model 
    
    features = vectorizer.fit_transform(data[text]).todense() # Turn into a sparse matrix    

    # Response variable
    
    response = data[column].values # values 

    ############################### STRATIFIED RANDOM SAMPLING ################################
    
    # Split into training and testing sets 

    X_train, X_test, y_train, y_test = train_test_split(features, response, 
                                                        test_size = 0.4, # training = 60%, test = 40%
                                                        random_state = 1234, # for reproducibility
                                                        stratify = data[year]) # stratifying by year
    
    # Label encode (normalize) response variable
    
    encoder = preprocessing.LabelEncoder()
    
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)

    return(X_train, y_train, X_test, y_test)


### Version 2: Creating DTM and Splitting data + Resampling

In [16]:


def dtm_train_resample(data, text, column, year):
    
    ############################### RESAMPLING ################################
    
    # Split into majority and minority classes: # I adapted some code from here: https://elitedatascience.com/imbalanced-classes 
       
    df_majority = data[data[column] == 0]
    df_minority = data[data[column] == 1]
    
    # Upsample (oversample) minority class 
    
    df_minority_upsampled = resample(df_minority, 
                                 replace = True,     # sample with replacement
                                 n_samples = 400,    # to match majority class
                                 random_state = 1234) # reproducible results
    
    # Combine majority class with upsampled minority class
    data = pd.concat([df_majority, df_minority_upsampled])
    
    ############################### DOCUMENT-TERM MATRIX ################################
    
    # BOW model 
    
    features = vectorizer.fit_transform(data[text]).todense() # Turn into a sparse matrix    

    # Response variable
    
    response = data[column].values # values 

    ############################### STRATIFIED RANDOM SAMPLING ################################
    
    # Split into training and testing sets 

    X_train, X_test, y_train, y_test = train_test_split(features, response, 
                                                        test_size = 0.3, # training = 70%, test = 30%
                                                        random_state = 1234, # for reproducibility
                                                        stratify = data[year]) # stratifying by year
    
    # Label encode (normalize) response variable
    
    encoder = preprocessing.LabelEncoder()
    
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.fit_transform(y_test)

    return(X_train, y_train, X_test, y_test)



### Training and testing data and response variables

I created training and testing data (text features) and their response variables using the two custom functions shown above.


In [17]:
# Asian American newspapers 

## None 

asian_lp_dtm = dtm_train(asian_sample, 'text', 'linked_progress', 'year')
asian_lh_dtm = dtm_train(asian_sample, 'text', 'linked_hurt', 'year')

## Resampled 

asian_lp_dtm_resample = dtm_train_resample(asian_sample, 'text', 'linked_progress', 'year')
asian_lh_dtm_resample = dtm_train_resample(asian_sample, 'text', 'linked_hurt', 'year')

# African American newspapers

## None 

black_lp_dtm = dtm_train(black_sample, 'text', 'linked_progress', 'year')
black_lh_dtm = dtm_train(black_sample, 'text', 'linked_hurt', 'year')

## Resampled 

black_lp_dtm_resample = dtm_train_resample(black_sample, 'text', 'linked_progress', 'year')
black_lh_dtm_resample = dtm_train_resample(black_sample, 'text', 'linked_hurt', 'year')


## Fit and evaluate a ML model

### Functions for various ML models

In [18]:
# Lasso

def fit_logistic_regression(X_train, y_train):
    model = LogisticRegression(fit_intercept = True,
                               penalty = 'l1', # Lasso 
                               solver = 'saga') # for faster algorithm
    model.fit(X_train, y_train)
    return model

# Naive-Bayes 

def fit_bayes(X_train, y_train):
    model = GaussianNB()
    model.fit(X_train, y_train)
    return model

# Xgboost

def fit_xgboost(X_train, y_train):
    model = XGBClassifier(random_state = 42,
                         seed = 2, 
                         colsample_bytree = 0.6,
                         subsample = 0.7)
    model.fit(X_train, y_train)
    return model


### Function for evaluating ML models (accuracy and balanced accuracy)

In [19]:

def test_model(model, X_train, y_train, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
#    print("Accuracy:", accuracy, "\n"
#          "Balanced accuracy:", balanced_accuracy)
    return(accuracy, balanced_accuracy)


### Model fitting 

Function for fitting selected models to the data

In [20]:
def fit_models(data):
    # Lasso
    lasso = fit_logistic_regression(data[0], data[1])
    # Naive-Bayes
    bayes = fit_bayes(data[0], data[1])
    # Xgboost
    xgboost = fit_xgboost(data[0], data[1])
    
    return(lasso, bayes, xgboost)

In [21]:
##### None 

In [22]:

# Asian American newspapers

asian_lp_fit = fit_models(asian_lp_dtm)
asian_lh_fit = fit_models(asian_lh_dtm)

# African American newspapers

black_lp_fit = fit_models(black_lp_dtm)
black_lh_fit = fit_models(black_lh_dtm)


In [23]:
#### Resampled 

In [24]:

# Asian American newspapers

asian_lp_fit_resample = fit_models(asian_lp_dtm_resample)
asian_lh_fit_resample = fit_models(asian_lh_dtm_resample)

# African American newspapers

black_lp_fit_resample = fit_models(black_lp_dtm_resample)
black_lh_fit_resample = fit_models(black_lh_dtm_resample)


### Model evaluations 

Function for testing multiple models.

In [25]:

def test_models(models, data):
    lasso = test_model(models[0], data[0], data[1], data[2], data[3]) 
    bayes = test_model(models[1], data[0], data[1], data[2], data[3])
    xgboost = test_model(models[2], data[0], data[1], data[2], data[3])
    return(lasso, bayes, xgboost)


Evaluate multiple models for each data.

In [26]:
# Asian American newspapers

asian_lp_models = test_models(asian_lp_fit, asian_lp_dtm)
asian_lp_models_resample = test_models(asian_lp_fit_resample, asian_lp_dtm_resample)

asian_lh_models = test_models(asian_lh_fit, asian_lh_dtm)
asian_lh_models_resample = test_models(asian_lh_fit_resample, asian_lh_dtm_resample)

# African American nespapers

black_lp_models = test_models(black_lp_fit, black_lp_dtm)
black_lp_models_resample = test_models(black_lp_fit_resample, black_lp_dtm_resample)

black_lh_models = test_models(black_lh_fit, black_lh_dtm)
black_lh_models_resample = test_models(black_lh_fit_resample, black_lh_dtm_resample)

Function for putting the model evaluations into a table.


In [27]:

def eval_table(data):
    table = pd.DataFrame(list(data), columns= ['Accuracy','Balanced Accuracy'])
    table.insert(loc = 0, column = 'Models', value = ['Lasso', 'Bayes', 'Xgboost'])
    return(table)


In [28]:
eval_table(asian_lp_models)

Unnamed: 0,Models,Accuracy,Balanced Accuracy
0,Lasso,0.934177,0.535678
1,Bayes,0.931646,0.5
2,Xgboost,0.931646,0.51716


In [29]:
eval_table(asian_lp_models_resample)

Unnamed: 0,Models,Accuracy,Balanced Accuracy
0,Lasso,0.984848,0.986761
1,Bayes,0.997475,0.99569
2,Xgboost,0.994949,0.993904


In [30]:
eval_table(black_lp_models)

Unnamed: 0,Models,Accuracy,Balanced Accuracy
0,Lasso,0.886139,0.556917
1,Bayes,0.888614,0.5
2,Xgboost,0.876238,0.502755


In [31]:
eval_table(black_lp_models_resample)

Unnamed: 0,Models,Accuracy,Balanced Accuracy
0,Lasso,0.935401,0.924077
1,Bayes,0.935401,0.920125
2,Xgboost,0.940568,0.923986


In [32]:

# None 

eval_table(asian_lp_models).to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/asian_lp_models.csv")
eval_table(asian_lh_models).to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/asian_lh_models.csv")

eval_table(black_lp_models).to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/black_lp_models.csv")
eval_table(black_lh_models).to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/black_lh_models.csv")

# Resampled 

eval_table(asian_lp_models_resample).to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/asian_lp_models_resample.csv")
eval_table(asian_lh_models_resample).to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/asian_lh_models_resample.csv")

eval_table(black_lp_models_resample).to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/black_lp_models_resample.csv")
eval_table(black_lh_models_resample).to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/black_lh_models_resample.csv")

## Prediction

### Function for predicting the unlabeled data

In [33]:

def test_text(text, model):   
      
    # BOW model 
    
    features = vectorizer.fit_transform(text).todense()
    
    # Prediction
    
    preds = model.predict(features)
    
    return preds

### Label the unlabeled data

In [34]:
# Asian Americans 

asian_lp_full = test_text(asian_unlabeled['text'], asian_lp_fit[0])
asian_lh_full = test_text(asian_unlabeled['text'], asian_lh_fit[0])

# African Americans 

black_lp_full = test_text(black_unlabeled['text'], black_lp_fit[0])
black_lh_full = test_text(black_unlabeled['text'], black_lh_fit[0])

Let's take a look at the number of observations for each class in the unlabeled data.

In [35]:
print(sum(asian_lp_full), sum(asian_lh_full))

print(sum(black_lp_full), sum(black_lh_full))

4348 1807
9329 16538


## Export classification results as CSV files 

I saved the classification results as CSV files to plot them in R. 

In [36]:

# Rename new columns  

asian_lp_data = pd.DataFrame(asian_lp_full).rename(columns = {0:'labeled_linked_progress'})
asian_lh_data = pd.DataFrame(asian_lh_full).rename(columns = {0:'labeled_linked_hurt'})
black_lp_data = pd.DataFrame(black_lp_full).rename(columns = {0:'labeled_linked_progress'})
black_lh_data = pd.DataFrame(black_lh_full).rename(columns = {0:'labeled_linked_hurt'})

# Save data 

asian_lp_data.to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/asian_lp_data.csv")
asian_lh_data.to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/asian_lh_data.csv")
black_lp_data.to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/black_lp_data.csv")
black_lh_data.to_csv("/home/jae/content-analysis-for-evaluating-ML-performances/processed_data/black_lh_data.csv")

This is what the final data looks like.

In [37]:
asian_lp_data.head()

Unnamed: 0,labeled_linked_progress
0,1
1,1
2,0
3,0
4,1
