## Text classification 

This part was mostly done by my wonderful RAs.

In [1]:
#Include relevant imports here
import numpy as np
import warnings
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')

In [2]:
# Read in scraped articles from csv file to dataframe
articles = pd.read_csv('/home/jae/ITS-Text-Classification/raw_data/sample_articles.csv')

In [3]:
# Select relevant columns and convert date column into Datetime format
articles = articles[['text', 'source', 'group', 'date', 'intervention', 'expanding', 'distancing', 'assimilating']]
articles['date'] =  pd.to_datetime(articles['date'], format='%Y%m%d')
articles

Unnamed: 0,text,source,group,date,intervention,expanding,distancing,assimilating
0,"Third Rate Or Not, No Meddling Please! By UPEN...",News India - Times,Indian,1997-10-24,post,0.0,1.0,0.0
1,Phagwah Parade Draws Many By DHARMVIR GEHLAUT ...,News India - Times,Indian,2001-03-23,post,,,
2,"Advani Blames Congress And UF DALTONGUNJ, Biha...",News India - Times,Indian,1998-01-23,post,,,
3,Violence Feared During Assembly Elections By N...,News India - Times,Indian,2001-01-05,post,,,
4,Indrani Rahman's Final Bow By ARUN A. AGUIAR P...,News India - Times,Indian,1999-02-19,post,,,
...,...,...,...,...,...,...,...,...
1010,President Bush offers Ramadan greeting Preside...,The Arab American View,Arab,2001-12-15,pre,1.0,0.0,0.0
1011,"US officials vow to fight bigotry ""We are list...",The Arab American View,Arab,2001-11-05,pre,1.0,0.0,0.0
1012,Al-Jazeera Bureau Chief Hafez Mirazi will be t...,The Arab American View,Arab,2002-02-15,pre,1.0,0.0,0.0
1013,There are literally thousands of Arab American...,The Arab American View,Arab,2002-04-28,pre,1.0,0.0,1.0


In [4]:
# Adding column to determine if there is relevant info (expanding, distancing, assimilating) or not 
list = []
for x in articles['assimilating'].isnull().values:
    if x == True:
        list.append(0)
    else:
        list.append(1)

# Create new binary column called category based on this
# 1 indicates domestic issue, 0 indicates non-domestic issue
articles['category'] = list
articles

Unnamed: 0,text,source,group,date,intervention,expanding,distancing,assimilating,category
0,"Third Rate Or Not, No Meddling Please! By UPEN...",News India - Times,Indian,1997-10-24,post,0.0,1.0,0.0,1
1,Phagwah Parade Draws Many By DHARMVIR GEHLAUT ...,News India - Times,Indian,2001-03-23,post,,,,0
2,"Advani Blames Congress And UF DALTONGUNJ, Biha...",News India - Times,Indian,1998-01-23,post,,,,0
3,Violence Feared During Assembly Elections By N...,News India - Times,Indian,2001-01-05,post,,,,0
4,Indrani Rahman's Final Bow By ARUN A. AGUIAR P...,News India - Times,Indian,1999-02-19,post,,,,0
...,...,...,...,...,...,...,...,...,...
1010,President Bush offers Ramadan greeting Preside...,The Arab American View,Arab,2001-12-15,pre,1.0,0.0,0.0,1
1011,"US officials vow to fight bigotry ""We are list...",The Arab American View,Arab,2001-11-05,pre,1.0,0.0,0.0,1
1012,Al-Jazeera Bureau Chief Hafez Mirazi will be t...,The Arab American View,Arab,2002-02-15,pre,1.0,0.0,0.0,1
1013,There are literally thousands of Arab American...,The Arab American View,Arab,2002-04-28,pre,1.0,0.0,1.0,1


In [5]:
# Check the balance of target variables 

articles['category'].value_counts()

1    574
0    441
Name: category, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

# Conduct train-test split where 90% of data used for training
train, val = train_test_split(articles, test_size=0.1, random_state=42)

In [7]:
# Cleaning the article data by removing html, punctuation, new-line breaks (\n), and stopwords

import re
import string
!pip install nltk
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Converting all data into lowercase and defining functions to remove extraneous elemets
train['text'] = train['text'].str.lower()
val['text'] = val['text'].str.lower()
compiling = re.compile(r'<[^>]+>')
                       
def remove_html(x):
    return compiling.sub('', x)
def remove_punctuation(x):
    return x.translate(str.maketrans('','', string.punctuation))
def remove_n(x):
    return x.replace('\n', '')
removal_words = stopwords.words('english')

# Applying functions to clean train 
train['text'] = train.agg({"text": [remove_html]})
train['text'] = train.agg({'text': [remove_punctuation]})
train['text'] = train.agg({'text': [remove_n]})
train['text'] = train['text'].apply(lambda x: " ".join([y for y in x.split() if y not in removal_words]))
# Applying functions to clean test
val['text'] = val.agg({"text": [remove_html]})
val['text'] = val.agg({'text': [remove_punctuation]})
val['text'] = val.agg({'text': [remove_n]})
val['text'] = val['text'].apply(lambda x: " ".join([y for y in x.split() if y not in removal_words ]))



[nltk_data] Downloading package stopwords to /home/jae/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Count all of the words that are in NA vs not in NA
from collections import Counter
NA_count = Counter(" ".join(train[train['category'] == 0]['text']).split())
not_NA_count = Counter(" ".join(train[train['category'] == 1]['text']).split())

# Creating dataframe named testing to hold these counts
testing = pd.DataFrame({"NA": NA_count, "Not NA": not_NA_count}).fillna(0)
testing

Unnamed: 0,NA,Not NA
put,113.0,126.0
intellectual,11.0,20.0
perversity,1.0,0.0
like,431.0,493.0
whenever,12.0,17.0
...,...,...
kalifatul,0.0,1.0
twostorey,0.0,1.0
spires,0.0,2.0
onestorey,0.0,1.0


In [9]:
# Make column of times NA appeared more than not NA
testing['NA distinct'] = testing['NA'] - testing['Not NA']
testing = testing.sort_values(by = 'NA distinct', ascending = False)

# Make column of percentage of words
testing['NA percent'] = testing['NA'] / (testing['NA'] +  testing['Not NA'])
tester = testing[testing['Not NA'] > 0].sort_values(by = 'NA percent', ascending = False)

# Above work leads to creating of the top 100 words that appear in NA articles, as seen in words areray
words = tester[tester['NA'] > 100].sort_values(by = 'NA distinct', ascending = False).head(300).reset_index()['index'].values.astype(str)
words

array(['party', 'bjp', 'congress', 'minister', 'film', 'gandhi',
       'israeli', 'chief', 'says', 'films', 'sharif', 'leader', 'seats',
       'election', 'government', 'temple', 'army', 'delhi', 'prime',
       'former', 'janata', 'parties', 'may', 'alliance', 'jerusalem',
       'set', 'soldiers', 'palestinian', 'palestinians', 'front', 'power',
       'khan', 'old', 'nuclear', 'land', 'vajpayee', 'home', 'site',
       'left', 'five', 'hindu', 'forces', 'indias', 'form', 'jewish',
       'elections', 'months', 'west', 'gujarat', 'away', 'coalition',
       'majority', 'mother', 'asia', 'situation', 'governments',
       'opposition', 'christians', 'mosque', 'yet', 'man', 'four',
       'though', 'music', 'making', 'next', 'lebanon', 'back', 'better',
       'lot', 'little', 'visit', 'court', 'despite', 'far', '10',
       'movement', 'came', 'killed', 'military', 'went', 'give',
       'conference', 'within', 'known', 'british', 'past', 'took',
       'family', 'christian', 'still

In [10]:
# Create counts of how often the number of times top 100 words appeared in all articles, with 1 and 2 grams
cv = CountVectorizer(vocabulary=words, ngram_range = (1, 2))

# Encode the count vectorizer to create a dataframe holding counts
ohc_words_x_train = pd.SparseDataFrame(cv.fit_transform(train['text']), 
                       train.index,
                       cv.get_feature_names())
ohc_words_x_train

TypeError: object() takes no parameters

In [None]:
# Get addition features from one hot encoding the source, intervention, and group columns
features_x_train = pd.concat([pd.get_dummies(train[col]) for col in ['source', 'intervention', 'group']], axis=1)
features_x_train = features_x_train.drop(columns = ["The Arab American View"])
features_x_train

In [None]:
# Create final training matrix from the above two dataframes created from cells
xx_train = pd.merge(left=ohc_words_x_train, left_index=True
                  ,right=features_x_train, right_index=True,
                  how='inner')
xx_train

In [None]:
# Fit our Logistic Regression model with L1 regularization and determine the training accuracy
yy_train = train['category']
NA_model = LogisticRegressionCV(fit_intercept = True, penalty = 'l1', solver = 'saga')
NA_model.fit(xx_train, yy_train)

accuracy = NA_model.score(xx_train, yy_train)
print("Training Accuracy: ", accuracy)

In [None]:
# Repeat the above process for word count matrix in testing
cv = CountVectorizer(vocabulary=words, ngram_range = (1, 2))

ohc_words_x_test = pd.SparseDataFrame(cv.fit_transform(val['text']), 
                       val.index,
                       cv.get_feature_names(), 
                       default_fill_value=0)
ohc_words_x_test

In [None]:
# Repeat the above process for feature matrix in testing 
features_x_test = pd.concat([pd.get_dummies(val[col]) for col in ['source', 'intervention', 'group']], axis=1)
features_x_test = features_x_test.drop(columns = ["The Arab American View"])
features_x_test

In [None]:
# Merging above dataframes to get the final testing matrix 
xx_test = pd.merge(left=ohc_words_x_test, left_index=True
                  ,right=features_x_test, right_index=True,
                  how='inner')
xx_test

In [None]:
# Obtain test accuracy score from trained model
accuracy = NA_model.score(xx_test, val['category'])
print("Test Accuracy: ", accuracy)

In [None]:
# Get the unlabeled data and follow procedures in the beginning of notebook to clean
unlabeled_articles = pd.read_csv('unlabeled_articles.csv', index_col=0)
unlabeled_articles['date'] =  pd.to_datetime(unlabeled_articles['date'], format='%Y%m%d')
unlabeled_articles

In [None]:
# Categorize whether article pertains to Arab-American or Indian-American issues based on publication source
list = []
for x in unlabeled_articles['source']:
    if x == "The Arab American News":
        list.append("Arab")
    else:
        list.append("Indian")

unlabeled_articles['group'] = list
unlabeled_articles

In [None]:
# Create count encoding dataframe of top words again, except for unlabeled data
cv = CountVectorizer(vocabulary=words, ngram_range = (1, 2))

ohc_words_x_unlabeled = pd.SparseDataFrame(cv.fit_transform(unlabeled_articles['text']), 
                       unlabeled_articles.index,
                       cv.get_feature_names(), 
                       default_fill_value=0)
ohc_words_x_unlabeled

In [None]:
# Same as above, except with the source, intervetion, and group features
features_x_unlabeled = pd.get_dummies(unlabeled_articles, columns=['source', 'intervention', 'group'])
features_x_unlabeled = features_x_unlabeled.drop(columns = ['text', 'date'])
features_x_unlabeled

In [None]:
# Data matrix created here
xx_unlabeled = pd.merge(left=ohc_words_x_unlabeled, left_index=True
                  ,right=features_x_unlabeled, right_index=True,
                  how='inner')
xx_unlabeled

In [None]:
# Use our model to predict whether the articles pertained to domestic vs non-domestic isues
values = NA_model.predict(xx_unlabeled)
values

In [None]:
# Export our results into a csv file
pd.DataFrame(values, xx_unlabeled.index).to_csv("/home/jae/ITS-Text-Classification/processed_data/final_values_predicted.csv")