# "Don't Patronize Me" Machine Learning Model Experimentation

In [1]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset, Environment
from azureml.core.conda_dependencies import CondaDependencies

subscription_id = None # obscured for security
resource_group = 'CS4650'
workspace_name = '4650_Project'

workspace = Workspace(subscription_id, resource_group, workspace_name)
env = Environment.get(workspace=workspace, name="Project_4650")

dataset = Dataset.get_by_name(workspace, name="Don\'t Patronize Me")
df = dataset.to_pandas_dataframe()

## Preprocessing
Trim the dataframe of the NaN rows, change the names of the columns, and turn the labels into True/False

In [2]:
df = df[2:]
df = df.rename(columns={"Column2":"theme", "Column4":"text", "Column5":"label"})
df["label"] = df["label"] >= 2.0
df = df.dropna()
df

Unnamed: 0,theme,text,label
2,in-need,The ones in need of constant medical care are ...,False
3,immigrant,NBC and Spanish-language Univision both declin...,False
4,in-need,A second T-Home project is being launched in t...,False
5,poor-families,Camfed would like to see this trend reversed ....,True
6,refugee,Kagunga village was reported to lack necessary...,False
...,...,...,...
10633,immigrant,"To me , I am always mindful that we are dealin...",True
10634,vulnerable,Other themes included promoting the inclusion ...,False
10635,immigrant,It came as the CDU was also humiliated by the ...,False
10636,hopeless,"Those were only days of helplessness , she say...",False


Set up word embeddings and paragraph embeddings for SVM

In [3]:
# This cell is boilerplate copied from the HW3 word_embedding.ipynb
import gensim.downloader as api

def download_word2vec_embeddings():
    print("Beginning pre-trained word embedding download")
    wv = api.load("word2vec-google-news-300")
    print(f"\nLoading completed\nVocab size: {len(wv.vocab)}")
    return wv

word2vec = download_word2vec_embeddings()

Beginning pre-trained word embedding download

Loading completed
Vocab size: 3000000


In [20]:
# boilerplate tokenization as done on the HW
!pip install nltk
import nltk
from preprocess import clean_text
nltk.download('punkt')
df["tokenized"] = df["text"].apply(lambda x: nltk.word_tokenize(clean_text(x)))



[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
import numpy as np
# Generate paragraph embeddings for a given text by averaging (which is what I assume they did)
def paragraph_embedding(sentence: list) -> np.ndarray:
    words = [word for word in sentence if word in word2vec.vocab]
    return np.mean(word2vec[words], axis=0)

In [6]:
from sklearn import model_selection
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(df["tokenized"],df["label"],test_size=0.2)

In [7]:
from sklearn.preprocessing import LabelEncoder
Encoder = LabelEncoder()
train_Y = Encoder.fit_transform(train_Y)
test_Y = Encoder.fit_transform(test_Y)

In [8]:
# TODO turn the train_X into a 2d sample x feature vector using paragraph_embedding
train_X_para_embed = np.array([paragraph_embedding(text) for text in train_X])
test_X_para_embed = np.array([paragraph_embedding(text) for text in test_X])
train_X_para_embed.shape

(8508, 300)

## SVM-WV

In [14]:
from sklearn.svm import SVC

#Model based on hyperperameters found here https://aclanthology.org/2020.coling-main.518.pdf
SVM = SVC(C=100, kernel="poly", gamma="scale")
SVM.fit(train_X_para_embed,train_Y)
predictions_SVM = SVM.predict(test_X_para_embed)


In [15]:
from sklearn.metrics import classification_report
print(classification_report(predictions_SVM, test_Y))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94      1962
           1       0.36      0.44      0.40       165

    accuracy                           0.90      2127
   macro avg       0.66      0.69      0.67      2127
weighted avg       0.91      0.90      0.90      2127



## SVM-BoW

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
train_X_bow = vectorizer.fit_transform(df['text'])
train_X_bow

<10635x29999 sparse matrix of type '<class 'numpy.float64'>'
	with 370808 stored elements in Compressed Sparse Row format>

In [16]:
#Model based on hyperperameters found here https://aclanthology.org/2020.coling-main.518.pdf
SVM_bow = SVC(C=10, kernel="rbf", gamma="scale")
SVM_bow.fit(train_X_para_embed,train_Y)
predictions_SVM_bow = SVM_bow.predict(test_X_para_embed)

In [17]:
print(classification_report(predictions_SVM_bow, test_Y))

              precision    recall  f1-score   support

           0       0.99      0.93      0.96      2047
           1       0.26      0.68      0.38        80

    accuracy                           0.92      2127
   macro avg       0.63      0.80      0.67      2127
weighted avg       0.96      0.92      0.93      2127



## Bi-Directional LSTM

In [22]:
from lstm import ClassificationModel

model = ClassificationModel()


ModuleNotFoundError: No module named 'lstm'