# Text classification 

[Carlos Ortiz](https://www.linkedin.com/in/carlosortizdev/), [Sarah Santiago](https://www.linkedin.com/in/sarah-santiago-7a297b18a/), and [Vivek Datta](https://www.linkedin.com/in/vivek-datta/) wrote the initial code. Jae Yeon Kim reviewed and modified it. Please use the `Python2` kernel to run this notebook. 

## Import libraries

In [None]:
#Include relevant imports here

from collections import Counter

# Data manipulation 
import numpy as np
import warnings
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler

# Data visualization 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, balanced_accuracy_score, cohen_kappa_score, precision_score, recall_score

warnings.filterwarnings('ignore')

In [None]:
## Import data 

In [None]:
# Load the scraped articles and the placebo articles from their CSV files.
# NOTE(review): both paths are absolute and machine-specific; adjust them when running elsewhere.
articles = pd.read_csv(
    '/home/jae/ITS-Text-Classification/processed_data/cleaned_text.csv'
)
placebo_articles = pd.read_csv(
    '/home/jae/ITS-Text-Classification/processed_data/placebo.csv'
)

## Feature extraction

In [None]:

# Hold out 10% of the articles for validation; the remaining 90% is used for training.
# The fixed random_state keeps the split identical from run to run.
train, val = train_test_split(articles, test_size=0.1, random_state=42)

## Exploratory data analysis

In [None]:
# Tally whitespace-separated tokens in the training articles, separately per label:
# category 0 is counted as "NA" and category 1 as "Not NA".
NA_count = Counter(
    token
    for document in train.loc[train['category'] == 0, 'text']
    for token in document.split()
)
not_NA_count = Counter(
    token
    for document in train.loc[train['category'] == 1, 'text']
    for token in document.split()
)

# One row per word, one column per label; a word never seen under a label gets a count of 0.
testing = pd.DataFrame({"NA": NA_count, "Not NA": not_NA_count}).fillna(0)

testing.head()

In [None]:
# Surplus of NA occurrences over Not-NA occurrences for each word, largest surplus first.
testing['NA distinct'] = testing['NA'] - testing['Not NA']
testing = testing.sort_values(by='NA distinct', ascending=False)

# Share of each word's occurrences that come from NA articles.
testing['NA percent'] = testing['NA'] / (testing['NA'] + testing['Not NA'])

# Keep only words that also occur in Not-NA articles, ordered by their NA share.
appears_in_not_NA = testing['Not NA'] > 0
tester = testing[appears_in_not_NA].sort_values(by='NA percent', ascending=False)

# Vocabulary for the model: among words seen more than 100 times in NA articles,
# take up to 300 with the largest NA surplus, as an array of strings.
frequent_in_NA = tester[tester['NA'] > 100]
words = (
    frequent_in_NA
    .sort_values(by='NA distinct', ascending=False)
    .head(300)
    .reset_index()['index']
    .values
    .astype(str)
)
words[:10]

## Feature engineering

In [None]:
# Count how often each vocabulary word (`words`) occurs in every training article.
cv = CountVectorizer(vocabulary=words, ngram_range=(1, 2))

word_counts_train = cv.fit_transform(train['text'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Build the frame from a dense array and give it the index of `train` and the word
# labels as columns. The later merge with the metadata features joins on the index,
# so the rows must carry the same (shuffled) labels as `train`; a default 0..n-1
# index would pair word counts with the wrong articles or drop rows entirely.
ohc_words_x_train = pd.DataFrame(word_counts_train.toarray(),
                                 index=train.index,
                                 columns=feature_names)

ohc_words_x_train.head()

In [None]:
# Additional features: one-hot encode the source, intervention, and group columns
# of the training articles and place the indicator columns side by side.
metadata_columns = ['source', 'intervention', 'group']
indicator_frames = [pd.get_dummies(train[name]) for name in metadata_columns]
features_x_train = pd.concat(indicator_frames, axis=1)

# Remove the indicator column for "The Arab American View".
# NOTE(review): presumably dropped to serve as the reference source level — confirm.
features_x_train = features_x_train.drop(columns=["The Arab American View"])
features_x_train.head()

In [None]:
# Final training matrix: word-count columns followed by the metadata indicator columns.
# Rows are matched on the index labels of the two frames (inner join), so both frames
# must be indexed by the same article labels.
xx_train = ohc_words_x_train.merge(
    features_x_train,
    how='inner',
    left_index=True,
    right_index=True,
)
xx_train.head()

## Training and testing lasso (least absolute shrinkage and selection operator)

In [None]:

# Fit a cross-validated logistic regression with an L1 (lasso) penalty on the
# training matrix and report its accuracy on that same training data.

yy_train = train['category']

# The saga solver shuffles the data internally, so a fixed random_state is needed
# for the fit to be reproducible (same seed as the train/validation split).
# NOTE(review): max_iter is left at its default; warnings are silenced globally in
# this notebook, so a non-converged fit would go unnoticed — consider checking.
NA_model = LogisticRegressionCV(fit_intercept=True,
                                penalty='l1',
                                solver='saga',
                                random_state=42)

NA_model.fit(xx_train, yy_train)

accuracy = NA_model.score(xx_train, yy_train)

# Format explicitly: under a Python 2 kernel, print("...", x) would display a tuple.
print("Training Accuracy: {}".format(accuracy))

In [None]:
# Build the word-count matrix for the validation articles using the same vocabulary.
cv = CountVectorizer(vocabulary=words,
                     ngram_range=(1, 2))

word_counts_val = cv.fit_transform(val['text'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# pd.SparseDataFrame was removed in pandas 1.0. A dense DataFrame holds the same
# counts (absent words are 0) and works on both old and new pandas versions.
# The index of `val` is kept so the later index-based merge lines rows up correctly.
ohc_words_x_test = pd.DataFrame(word_counts_val.toarray(),
                                index=val.index,
                                columns=feature_names)

ohc_words_x_test.head()

In [None]:

# One-hot encode the source, intervention, and group columns of the validation articles.

metadata_columns = ['source', 'intervention', 'group']

features_x_test = pd.concat([pd.get_dummies(val[col]) for col in metadata_columns], axis=1)

# Align the columns with the training features. The validation split is small, so a
# category may be absent from it (its indicator is then added as all zeros), and the
# column order must match what the model was fitted on. Because features_x_train no
# longer contains "The Arab American View", that indicator is removed here as well,
# without raising if the validation split happens not to contain that source.
features_x_test = features_x_test.reindex(columns=features_x_train.columns, fill_value=0)

features_x_test.head()

In [None]:

# Final validation matrix: word-count columns followed by the metadata indicator
# columns, with rows matched on the index labels of the two frames (inner join).

xx_test = ohc_words_x_test.merge(
    features_x_test,
    how='inner',
    left_index=True,
    right_index=True,
)

xx_test.head()

In [None]:

def test_model(model, x_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    x_test : feature matrix passed to ``model.predict``.
    y_test : true labels the predictions are compared against.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)``, in that order, each computed with
        the corresponding scikit-learn metric at its default settings.
    """
    predictions = model.predict(x_test)
    metric_functions = (accuracy_score, precision_score, recall_score)
    return tuple(metric(y_test, predictions) for metric in metric_functions)


In [None]:
# Accuracy, precision, and recall of the fitted model on the validation split.
test_results = test_model(NA_model, xx_test, val['category'])


In [None]:
# Summary table: one row per metric, with the score first and the metric name second.
table = pd.DataFrame(
    {
        'Scores': list(test_results),
        'Metrics': ['Accuracy', 'Precision', 'Recall'],
    },
    columns=['Scores', 'Metrics'],
)

table


## Label the unlabeled data 

In [None]:

# Load the unlabeled articles, using the first CSV column as the row index, and
# parse the `date` column (stored as YYYYMMDD) into datetimes.

unlabeled_articles = pd.read_csv(
    '/home/jae/ITS-Text-Classification/raw_data/unlabeled_articles.csv',
    index_col=0,
)

unlabeled_articles['date'] = pd.to_datetime(
    unlabeled_articles['date'],
    format='%Y%m%d',
)

unlabeled_articles.head()

In [None]:
# Derive the `group` column from the publication source: articles from
# "The Arab American News" are labeled "Arab", every other source "Indian".
# NOTE(review): the training features refer to a source named "The Arab American View";
# confirm that the differing source name here is intentional.

response = [
    "Arab" if source_name == "The Arab American News" else "Indian"
    for source_name in unlabeled_articles['source']
]

unlabeled_articles['group'] = response

unlabeled_articles.head()

In [None]:
# Build the word-count matrix for the unlabeled articles using the same vocabulary.
cv = CountVectorizer(vocabulary=words, ngram_range=(1, 2))

word_counts_unlabeled = cv.fit_transform(unlabeled_articles['text'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# pd.SparseDataFrame was removed in pandas 1.0. A dense DataFrame holds the same
# counts (absent words are 0) and works on both old and new pandas versions.
# The articles' own index is kept so the later index-based merge lines rows up.
ohc_words_x_unlabeled = pd.DataFrame(word_counts_unlabeled.toarray(),
                                     index=unlabeled_articles.index,
                                     columns=feature_names)

ohc_words_x_unlabeled.head()

In [None]:
# Metadata features for the unlabeled articles: discard the text and date columns,
# then one-hot encode source, intervention, and group.
# NOTE(review): get_dummies with `columns=` prefixes each indicator with its column
# name and keeps every level, whereas the training features are unprefixed and drop
# "The Arab American View" — verify that the resulting columns line up with the
# matrix the model was trained on.
unlabeled_metadata = unlabeled_articles.drop(columns=['text', 'date'])
features_x_unlabeled = pd.get_dummies(
    unlabeled_metadata,
    columns=['source', 'intervention', 'group'],
)
features_x_unlabeled.head()

In [None]:
# Final matrix for the unlabeled articles: word-count columns followed by the metadata
# indicator columns, with rows matched on the index labels of the two frames (inner join).
xx_unlabeled = ohc_words_x_unlabeled.merge(
    features_x_unlabeled,
    how='inner',
    left_index=True,
    right_index=True,
)

xx_unlabeled.head()

In [None]:
# Use our model to predict whether the articles pertained to domestic vs non-domestic issues.
# NOTE(review): xx_unlabeled must have the same columns, in the same order, as the
# matrix NA_model was fitted on — verify before trusting these labels.
values = NA_model.predict(xx_unlabeled)
values


## Label the unlabeled placebo data 

In [None]:
# Build the word-count matrix for the placebo articles using the same vocabulary.
# The vectorizer is created here, as in the other feature cells, so this cell does
# not depend on whichever `cv` an earlier cell happened to leave behind.
cv = CountVectorizer(vocabulary=words, ngram_range=(1, 2))

word_counts_placebo = cv.fit_transform(placebo_articles['text'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# pd.SparseDataFrame was removed in pandas 1.0. A dense DataFrame holds the same
# counts (absent words are 0) and works on both old and new pandas versions.
ohc_words_placebo = pd.DataFrame(word_counts_placebo.toarray(),
                                 index=placebo_articles.index,
                                 columns=feature_names)

In [None]:
# Columns present in the unlabeled matrix but missing from the placebo word counts.
# Built as an ordered list rather than a set difference: these columns are appended
# to the placebo frame in this order, and the model reads features by position, so
# they must follow the column order of xx_unlabeled instead of an arbitrary set order.
placebo_columns = set(ohc_words_placebo.columns)
lost_cols = [col for col in xx_unlabeled.columns if col not in placebo_columns]

In [None]:
# Add every missing column to the placebo matrix, filled with zeros.
# NOTE(review): this sets all of the added indicators to 0 for every placebo
# article — confirm that is the intended treatment.
for missing_col in lost_cols:
    ohc_words_placebo[missing_col] = 0


In [None]:
# Sanity check: displays the columns of xx_unlabeled still missing from the placebo
# matrix; this should be an empty set once the missing columns have been added.
set(xx_unlabeled.columns).difference(ohc_words_placebo.columns)

In [None]:
# Predict the category of each placebo article with the fitted model.
# NOTE(review): the columns of ohc_words_placebo must be in the same order as the
# matrix NA_model was fitted on — verify.
placebo_values= NA_model.predict(ohc_words_placebo)

## Exporting files 

In [None]:
# Export results into a csv file
# pd.DataFrame(values, xx_unlabeled.index).to_csv("/home/jae/ITS-Text-Classification/processed_data/predicted.csv")

In [None]:
# Export the placebo predictions to a csv file (one row per article, default 0..n-1 index).
# NOTE(review): the output path is absolute and machine-specific.

pd.DataFrame(placebo_values).to_csv("/home/jae/ITS-Text-Classification/processed_data/placebo_predicted.csv")