In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
import gensim.downloader
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

nltk.download('stopwords')
stopwords_set = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Logistic Regression Classfier

In [2]:
train=pd.read_csv('10k_filings_train.csv')
test=pd.read_csv('10k_filings_test_wo_labels.csv')

Clean body and convert 'body' string to float

In [3]:
WORD_RE = re.compile(r"\b[a-z]\w+\b")
def clean(text):
  soup = BeautifulSoup(text)
  text = soup.get_text(" ")
  text = text.lower()
  text = WORD_RE.findall(text)
  cleaned_text = [w for w in text if w not in stopwords_set]
  return " ".join(cleaned_text)

train["cleaned_body"] = train["body"].map(clean)
test["cleaned_body"] = test["body"].map(clean)

In [4]:
train_vectors = gensim.downloader.load("glove-wiki-gigaword-300")
def average_embeddings(document_text):
  embeddings = []
  for word in document_text.split():
    if word in train_vectors:
      embeddings.append(train_vectors[word])
  if len(embeddings) == 0:
    return np.zeros((1, 300))
  embeddings = np.array(embeddings)
  return np.average(embeddings, axis=0).reshape(1, -1)

train_embeddings = []
for i, row in train.iterrows():
  doc_embed = average_embeddings(row.cleaned_body)
  train_embeddings.append(doc_embed)

train_embeddings = np.array(train_embeddings).squeeze()

Build the model

In [5]:
label_encoder = LabelEncoder()
train["target"] = label_encoder.fit_transform(train.label.values)

In [12]:
parameters = {"clf__C": [1.0, 0.1, 10, 5, 2, 0.0001, 100]}
pipe = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression(max_iter=100000))])
clf = GridSearchCV(pipe, parameters)
clf.fit(train_embeddings, train.target)
results = clf.cv_results_

all_c_scores = [value for key, value in results.items() if "split" in key]
avg_c_scores = np.average(all_c_scores, axis=0)
std_c_scores = np.std(all_c_scores, axis=0)

for param, avg, std in zip(parameters["clf__C"], avg_c_scores, std_c_scores):
    print(f"{param}\t{avg:.3} ({std:.3})")

1.0	0.757 (0.0163)
0.1	0.797 (0.019)
10	0.718 (0.0192)
5	0.724 (0.0208)
2	0.744 (0.0179)
0.0001	0.67 (0.0131)
100	0.7 (0.0166)


In [13]:
clf = GridSearchCV(pipe, {"clf__C": [0.19]})
clf.fit(train_embeddings, train.target)
predict_train_LR=clf.predict(train_embeddings)
train["predict_labels_LR"]=label_encoder.inverse_transform(predict_train_LR)
accuracy_score=(train.label,train.predict_labels_LR)

In [9]:
#clf = LogisticRegression(max_iter=100000)
#cross_validate(clf,X=train_embeddings,y=train.target,scoring='f1_weighted',return_train_score=True)

{'fit_time': array([0.53507686, 0.44483018, 0.49220967, 0.40892529, 0.40940762]),
 'score_time': array([0.0009973 , 0.00099707, 0.00099659, 0.00099635, 0.00102592]),
 'test_score': array([0.76632137, 0.77551107, 0.78632119, 0.74602049, 0.76366462]),
 'train_score': array([0.79088025, 0.78899021, 0.78937036, 0.79646189, 0.79278473])}

Predict the test data

In [14]:
test_embeddings = []
for i, row in test.iterrows():
  doc_embed = average_embeddings(row.cleaned_body)
  test_embeddings.append(doc_embed)
test_embeddings = np.array(test_embeddings).squeeze()

In [15]:
predict_test_LR = clf.predict(test_embeddings)
test["prediction_labels"] = label_encoder.inverse_transform(predict_test_LR)

In [16]:
test

Unnamed: 0,id,body,cleaned_body,prediction_labels
0,935036-2017-1,"\n BUSINESS General \n ACI Worldwide, Inc...",business general aci worldwide inc aci aci wor...,Services
1,1435049-2017-1,Business. \n Overview \n References herein to...,business overview references herein us company...,Manufacturing
2,1158449-2017-1,Business. Unless the context otherwise r...,business unless context otherwise requires adv...,Retail Trade
3,849706-2017-1,Business. \n \n \n \n \nSummary \n \n \n ...,business summary advanced environmental recycl...,Manufacturing
4,105608-2017-1,BUSINESS \n \n OVERVIEW \n \n AG&E Holdin...,business overview ag holdings inc wholly owned...,Manufacturing
...,...,...,...,...
756,1675634-2017-4,Description of Business \n \n Company Info...,description business company information incor...,Services
757,1614556-2017-4,BUSINESS Corporate History Star...,business corporate history star alliance inter...,Services
758,1557798-2017-4,Business \n \n Corporate Overview \n \n W...,business corporate overview currently shell co...,Services
759,1564709-2017-4,Business \n The Company is a holding company ...,business company holding company incorporated ...,Manufacturing


Load CSV

In [17]:
test[["id", "prediction_labels"]].to_csv("LogRegCV_Assignment3_jlin157.csv", index=False)

# Naive Bayes Classifier

In [25]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(train_embeddings, train.target)

GaussianNB()

In [24]:
#cross_validate(gnb,X=train_embeddings,y=train.target,scoring='f1_weighted',return_train_score=True)

{'fit_time': array([0.01595807, 0.01396251, 0.01296473, 0.01296496, 0.01296449]),
 'score_time': array([0.01595616, 0.01396275, 0.01396346, 0.01396298, 0.01396346]),
 'test_score': array([0.66390769, 0.68200141, 0.68383934, 0.66109405, 0.67973991]),
 'train_score': array([0.68679329, 0.69093326, 0.68519437, 0.69493132, 0.68091626])}

In [26]:
predict_train_NB = gnb.predict(train_embeddings)
train["predict_labels_NB"] = label_encoder.inverse_transform(predict_train_NB)

In [27]:
accuracy_score(train.label,train.predict_labels_NB)

TypeError: 'tuple' object is not callable

Predict the test data

In [28]:
test_NB=pd.DataFrame(columns=('id', 'prediction_labels'))
test_NB

Unnamed: 0,id,prediction_labels


In [29]:
predict_test_NB = gnb.predict(test_embeddings)

test_NB["id"]=test['id']
test_NB["prediction_labels"] = label_encoder.inverse_transform(predict_test_NB)

In [30]:
test_NB

Unnamed: 0,id,prediction_labels
0,935036-2017-1,Services
1,1435049-2017-1,Manufacturing
2,1158449-2017-1,Retail Trade
3,849706-2017-1,Wholesale Trade
4,105608-2017-1,Wholesale Trade
...,...,...
756,1675634-2017-4,Services
757,1614556-2017-4,"Finance, Insurance, And Real Estate"
758,1557798-2017-4,"Finance, Insurance, And Real Estate"
759,1564709-2017-4,Retail Trade


In [31]:
test_NB[["id", "prediction_labels"]].to_csv("NaiBay_Assignment3_jlin157.csv", index=False)

# Random Forest Classifier

In [37]:
from sklearn.ensemble import RandomForestClassifier

rfc=RandomForestClassifier()
rfc.fit(train_embeddings, train.target)

RandomForestClassifier()

In [36]:
#cross_validate(rfc,X=train_embeddings,y=train.target,scoring='f1_weighted',return_train_score=True)

{'fit_time': array([2.37031531, 2.39514756, 2.30124712, 2.33394456, 2.33283353]),
 'score_time': array([0.01595211, 0.01792765, 0.01695538, 0.01695442, 0.0169549 ]),
 'test_score': array([0.77210034, 0.76964383, 0.79733909, 0.7599964 , 0.77988987]),
 'train_score': array([1.        , 0.99958956, 0.9987652 , 0.99877213, 0.9987652 ])}

In [38]:
predict_train_rfc = rfc.predict(train_embeddings)
train["predict_labels_rfc"] = label_encoder.inverse_transform(predict_train_rfc)

In [39]:
accuracy_score(train.label,train.predict_labels_rfc)

TypeError: 'tuple' object is not callable

Predict the test data

In [40]:
test_rfc=pd.DataFrame(columns=('id', 'prediction_labels'))
test_rfc

Unnamed: 0,id,prediction_labels


In [41]:
predict_test_rfc = rfc.predict(test_embeddings)
test_rfc["id"]=test['id']
test_rfc["prediction_labels"] = label_encoder.inverse_transform(predict_test_rfc)

In [42]:
test_rfc

Unnamed: 0,id,prediction_labels
0,935036-2017-1,Services
1,1435049-2017-1,Manufacturing
2,1158449-2017-1,Retail Trade
3,849706-2017-1,Manufacturing
4,105608-2017-1,Manufacturing
...,...,...
756,1675634-2017-4,Services
757,1614556-2017-4,Services
758,1557798-2017-4,Services
759,1564709-2017-4,Manufacturing


In [43]:
test_rfc[["id", "prediction_labels"]].to_csv("RanFor_Assignment3_jlin157.csv", index=False)