# Text classification 

My wonderful RAs did most of the coding. I added few comments to make what they did clear and indicate the future steps I need to take. Note that the right kernel for this code is `Python2`. 

## Importing libraries

In [1]:
#Include relevant imports here

# Data manipulation 
import numpy as np
import warnings
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler

# Data visualization 
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# ML
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, balanced_accuracy_score, cohen_kappa_score, precision_score, recall_score

warnings.filterwarnings('ignore')

## Import data 

In [2]:
# Read in scraped articles from csv file to dataframe
articles = pd.read_csv('/home/jae/ITS-Text-Classification/processed_data/cleaned_text.csv')

## Feature extraction

In [3]:
from sklearn.model_selection import train_test_split

# Conduct train-test split where 90% of data used for training

train, val = train_test_split(articles, 
                              test_size = 0.1, 
                              random_state = 42) # for reproducibility

## Exploratory data analysis

In [4]:
# Count all of the words that are in NA vs not in NA
from collections import Counter
NA_count = Counter(" ".join(train[train['category'] == 0]['text']).split())
not_NA_count = Counter(" ".join(train[train['category'] == 1]['text']).split())

# Creating dataframe named testing to hold these counts
testing = pd.DataFrame({"NA": NA_count, "Not NA": not_NA_count}).fillna(0)
testing.head()

Unnamed: 0,NA,Not NA
0,3.0,0.0
0,0.0,1.0
6,0.0,1.0
1,0.0,1.0
15,1.0,0.0


In [5]:
# Make column of times NA appeared more than not NA
testing['NA distinct'] = testing['NA'] - testing['Not NA']
testing = testing.sort_values(by = 'NA distinct', ascending = False)

# Make column of percentage of words
testing['NA percent'] = testing['NA'] / (testing['NA'] +  testing['Not NA'])
tester = testing[testing['Not NA'] > 0].sort_values(by = 'NA percent', ascending = False)

# Above work leads to creating of the top 100 words that appear in NA articles, as seen in words areray
words = tester[tester['NA'] > 100].sort_values(by = 'NA distinct', ascending = False).head(300).reset_index()['index'].values.astype(str)
words[:10]

array(['party', 'bjp', 'congress', 'minister', 'film', 'gandhi',
       'israeli', 'chief', 'says', 'films'], dtype='|S13')

## Feature engineering

In [6]:
# Create counts of how often the number of times top 100 words appeared in all articles, with 1 and 2 grams
cv = CountVectorizer(vocabulary=words, ngram_range = (1, 2))

# Encode the count vectorizer to create a dataframe holding counts
ohc_words_x_train = pd.SparseDataFrame(cv.fit_transform(train['text']), 
                       index = train.index,
                       columns = cv.get_feature_names(), 
                       default_fill_value=0)

ohc_words_x_train.head()

Unnamed: 0,party,bjp,congress,minister,film,gandhi,israeli,chief,says,films,...,states,world,people,muslims,muslim,community,us,arab,said,american
883,0,5,0,0,0,0,0,0,0,0,...,0,0,0,4,5,0,0,0,2,0
589,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,2,0,1,0,1
88,0,0,0,0,0,0,0,0,0,0,...,0,1,5,0,1,0,1,0,0,0
63,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,1,2,0,18,0
886,0,0,0,0,0,0,1,0,5,0,...,0,3,2,4,3,2,2,0,0,2


In [7]:
# Get addition features from one hot encoding the source, intervention, and group columns
features_x_train = pd.concat([pd.get_dummies(train[col]) for col in ['source', 'intervention', 'group']], axis=1)
features_x_train = features_x_train.drop(columns = ["The Arab American View"])
features_x_train.head()

Unnamed: 0,India - West,India Abroad,News India - Times,The Arab American News,post,pre,Arab,Indian
883,0,1,0,0,0,1,0,1
589,0,0,0,1,0,1,1,0
88,0,0,1,0,1,0,0,1
63,0,0,1,0,1,0,0,1
886,0,1,0,0,0,1,0,1


In [8]:
# Create final training matrix from the above two dataframes created from cells
xx_train = pd.merge(left=ohc_words_x_train, left_index=True
                  ,right=features_x_train, right_index=True,
                  how='inner')
xx_train.head()

Unnamed: 0,party,bjp,congress,minister,film,gandhi,israeli,chief,says,films,...,said,american,India - West,India Abroad,News India - Times,The Arab American News,post,pre,Arab,Indian
883,0,5,0,0,0,0,0,0,0,0,...,2,0,0,1,0,0,0,1,0,1
589,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,1,1,0
88,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
63,0,0,0,0,0,0,0,0,0,0,...,18,0,0,0,1,0,1,0,0,1
886,0,0,0,0,0,0,1,0,5,0,...,0,2,0,1,0,0,0,1,0,1


## Training and testing lasso (least absolute shrinkage and selection operator)

In [9]:
# Fit our Logistic Regression model with L1 regularization and determine the training accuracy
yy_train = train['category']
NA_model = LogisticRegressionCV(fit_intercept = True, penalty = 'l1', solver = 'saga')
NA_model.fit(xx_train, yy_train)

accuracy = NA_model.score(xx_train, yy_train)
print("Training Accuracy: ", accuracy)

('Training Accuracy: ', 0.8083242059145673)


In [10]:
# Repeat the above process for word count matrix in testing
cv = CountVectorizer(vocabulary=words, 
                     ngram_range = (1, 2))

ohc_words_x_test = pd.SparseDataFrame(cv.fit_transform(val['text']), 
                       val.index,
                       cv.get_feature_names(), 
                       default_fill_value=0)

ohc_words_x_test.head()

Unnamed: 0,party,bjp,congress,minister,film,gandhi,israeli,chief,says,films,...,states,world,people,muslims,muslim,community,us,arab,said,american
752,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,2,1,0,4,0
519,0,0,0,1,0,0,1,0,0,0,...,0,10,2,3,15,1,1,7,0,1
210,4,0,2,1,0,0,0,1,0,0,...,4,0,2,0,1,0,1,0,0,0
611,1,0,0,0,0,0,0,0,0,0,...,1,0,15,0,1,5,8,0,11,5
914,0,0,0,1,0,0,2,0,2,0,...,1,0,1,6,7,2,10,1,0,3


In [11]:
# Repeat the above process for feature matrix in testing 
features_x_test = pd.concat([pd.get_dummies(val[col]) for col in ['source', 'intervention', 'group']], axis=1)
features_x_test = features_x_test.drop(columns = ["The Arab American View"])
features_x_test.head()

Unnamed: 0,India - West,India Abroad,News India - Times,The Arab American News,post,pre,Arab,Indian
752,1,0,0,0,0,1,0,1
519,0,0,0,1,0,1,1,0
210,0,1,0,0,1,0,0,1
611,0,0,1,0,0,1,0,1
914,0,1,0,0,0,1,0,1


In [12]:
# Merging above dataframes to get the final testing matrix 
xx_test = pd.merge(left=ohc_words_x_test, left_index=True
                  ,right=features_x_test, right_index=True,
                   how='inner')
xx_test.head()

Unnamed: 0,party,bjp,congress,minister,film,gandhi,israeli,chief,says,films,...,said,american,India - West,India Abroad,News India - Times,The Arab American News,post,pre,Arab,Indian
752,0,0,0,0,0,0,0,0,0,0,...,4,0,1,0,0,0,0,1,0,1
519,0,0,0,1,0,0,1,0,0,0,...,0,1,0,0,0,1,0,1,1,0
210,4,0,2,1,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,1
611,1,0,0,0,0,0,0,0,0,0,...,11,5,0,0,1,0,0,1,0,1
914,0,0,0,1,0,0,2,0,2,0,...,0,3,0,1,0,0,0,1,0,1


In [53]:

def test_model(model, x_test, y_test):
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    return(accuracy, precision, recall)


In [54]:
test_results = test_model(NA_model, x_test = xx_test, y_test = val['category'])


In [55]:
table = pd.DataFrame(test_results, columns = ['Scores'])

table['Metrics'] = ['Accuracy', 'Precision', 'Recall']

table

Unnamed: 0,Scores,Metrics
0,0.72549,Accuracy
1,0.753846,Precision
2,0.803279,Recall



## Label the unlabeled data 

In [57]:
# Get the unlabeled data and follow procedures in the beginning of notebook to clean
unlabeled_articles = pd.read_csv('/home/jae/ITS-Text-Classification/raw_data/unlabeled_articles.csv', index_col=0)
unlabeled_articles['date'] =  pd.to_datetime(unlabeled_articles['date'], format='%Y%m%d')
unlabeled_articles

Unnamed: 0,text,source,date,intervention
1,Bleak Fate Of Women's Reservation Bill By SOU...,News India - Times,1998-07-24,post
2,Pak Christians Protest Blasphemy Law By Our S...,News India - Times,1998-07-24,post
3,Women's bill likely to be abandoned again ...,India Abroad,1998-07-24,post
4,Kashmir's Karma: An Analysis By HARI C. POLAV...,News India - Times,1998-07-17,post
5,Exiled Writer Can't See Her Dying Mother WASH...,News India - Times,1998-07-17,post
6,Police to launch investigation against Husa...,India Abroad,1998-07-17,post
7,Opposition stalls tabling of women's bill ...,India Abroad,1998-07-17,post
8,Gujral on global panel to monitor Algeria ...,India Abroad,1998-07-17,post
9,Repeal Pak Blasphemy Laws By UPENDRA SABAT N...,News India - Times,1998-07-10,post
10,"UN Soft Toward India, Needs Viagra: Pak Envoy ...",News India - Times,1998-07-10,post


In [58]:
# Categorize whether article pertains to Arab-American or Indian-American issues based on publication source
list = []
for x in unlabeled_articles['source']:
    if x == "The Arab American News":
        list.append("Arab")
    else:
        list.append("Indian")

unlabeled_articles['group'] = list
unlabeled_articles

Unnamed: 0,text,source,date,intervention,group
1,Bleak Fate Of Women's Reservation Bill By SOU...,News India - Times,1998-07-24,post,Indian
2,Pak Christians Protest Blasphemy Law By Our S...,News India - Times,1998-07-24,post,Indian
3,Women's bill likely to be abandoned again ...,India Abroad,1998-07-24,post,Indian
4,Kashmir's Karma: An Analysis By HARI C. POLAV...,News India - Times,1998-07-17,post,Indian
5,Exiled Writer Can't See Her Dying Mother WASH...,News India - Times,1998-07-17,post,Indian
6,Police to launch investigation against Husa...,India Abroad,1998-07-17,post,Indian
7,Opposition stalls tabling of women's bill ...,India Abroad,1998-07-17,post,Indian
8,Gujral on global panel to monitor Algeria ...,India Abroad,1998-07-17,post,Indian
9,Repeal Pak Blasphemy Laws By UPENDRA SABAT N...,News India - Times,1998-07-10,post,Indian
10,"UN Soft Toward India, Needs Viagra: Pak Envoy ...",News India - Times,1998-07-10,post,Indian


In [59]:
# Create count encoding dataframe of top words again, except for unlabeled data
cv = CountVectorizer(vocabulary=words, ngram_range = (1, 2))

ohc_words_x_unlabeled = pd.SparseDataFrame(cv.fit_transform(unlabeled_articles['text']), 
                       unlabeled_articles.index,
                       cv.get_feature_names(), 
                       default_fill_value=0)
ohc_words_x_unlabeled

Unnamed: 0,party,bjp,congress,minister,film,gandhi,israeli,chief,says,films,...,states,world,people,muslims,muslim,community,us,arab,said,american
1,9,6,7,0,0,1,0,2,0,0,...,0,0,0,0,1,0,0,0,1,0
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,17,6,5,5,0,2,0,0,0,0,...,1,0,0,0,1,0,0,0,5,0
4,0,0,0,1,0,0,0,0,0,0,...,13,0,3,2,4,0,1,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,2,0,1,0,2,0,1,0,2,0
6,2,3,0,0,0,0,0,0,0,0,...,0,0,2,0,1,0,0,0,7,0
7,21,6,9,4,0,3,0,1,0,0,...,0,0,0,2,9,2,0,0,5,0
8,0,0,0,3,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,3,0
9,0,0,0,0,0,0,0,0,0,0,...,2,0,1,1,1,0,1,0,11,0
10,2,2,0,1,0,0,0,0,0,0,...,0,1,2,3,2,0,7,0,20,0


In [60]:
# Same as above, except with the source, intervetion, and group features
features_x_unlabeled = pd.get_dummies(unlabeled_articles, columns=['source', 'intervention', 'group'])
features_x_unlabeled = features_x_unlabeled.drop(columns = ['text', 'date'])
features_x_unlabeled

Unnamed: 0,source_India - West,source_India Abroad,source_News India - Times,source_The Arab American News,intervention_post,intervention_pre,group_Arab,group_Indian
1,0,0,1,0,1,0,0,1
2,0,0,1,0,1,0,0,1
3,0,1,0,0,1,0,0,1
4,0,0,1,0,1,0,0,1
5,0,0,1,0,1,0,0,1
6,0,1,0,0,1,0,0,1
7,0,1,0,0,1,0,0,1
8,0,1,0,0,1,0,0,1
9,0,0,1,0,1,0,0,1
10,0,0,1,0,1,0,0,1


In [61]:
# Data matrix created here
xx_unlabeled = pd.merge(left=ohc_words_x_unlabeled, left_index=True
                  ,right=features_x_unlabeled, right_index=True,
                  how='inner')
xx_unlabeled

Unnamed: 0,party,bjp,congress,minister,film,gandhi,israeli,chief,says,films,...,said,american,source_India - West,source_India Abroad,source_News India - Times,source_The Arab American News,intervention_post,intervention_pre,group_Arab,group_Indian
1,9,6,7,0,0,1,0,2,0,0,...,1,0,0,0,1,0,1,0,0,1
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
3,17,6,5,5,0,2,0,0,0,0,...,5,0,0,1,0,0,1,0,0,1
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
5,0,0,0,0,0,0,0,0,0,0,...,2,0,0,0,1,0,1,0,0,1
6,2,3,0,0,0,0,0,0,0,0,...,7,0,0,1,0,0,1,0,0,1
7,21,6,9,4,0,3,0,1,0,0,...,5,0,0,1,0,0,1,0,0,1
8,0,0,0,3,0,0,0,0,0,0,...,3,0,0,1,0,0,1,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,11,0,0,0,1,0,1,0,0,1
10,2,2,0,1,0,0,0,0,0,0,...,20,0,0,0,1,0,1,0,0,1


In [62]:
# Use our model to predict whether the articles pertained to domestic vs non-domestic isues
values = NA_model.predict(xx_unlabeled)
values

array([1, 1, 0, ..., 1, 0, 1])

## Exporting files 

In [63]:
# Export our results into a csv file
pd.DataFrame(values, xx_unlabeled.index).to_csv("/home/jae/ITS-Text-Classification/processed_data/predicted.csv")