In [61]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [36]:
TRAIN_DATA_PATH = "Data/train.csv"

In [37]:
df = pd.read_csv(TRAIN_DATA_PATH)

In [38]:
df.head()

Unnamed: 0,profile,profession
0,"She has been working with children in camp, co...",teacher
1,He holds a PhD in Biosystems Engineering from ...,professor
2,John served as the Department Chair of Foley’s...,attorney
3,She has more 20 years of experience in the fie...,surgeon
4,Over the years Ash has built an impressive fas...,photographer


In [39]:
# get classwise distribution of data
df.groupby("profession").count()

Unnamed: 0_level_0,profile
profession,Unnamed: 1_level_1
accountant,1780
architect,3268
attorney,10552
chiropractor,837
comedian,910
composer,1835
dentist,4653
dietitian,1288
dj,494
filmmaker,2241


In [40]:
X,y = df.drop("profession", axis=1), df["profession"]

In [55]:
over = RandomOverSampler()
under = RandomUnderSampler()

X_sampled, y_sampled = under.fit_resample(X, y)

In [57]:
X_sampled["profession"] = y_sampled

In [58]:
df_sampled = X_sampled

In [59]:
# get classwise distribution of data
df_sampled.groupby("profession").count()

Unnamed: 0_level_0,profile
profession,Unnamed: 1_level_1
accountant,465
architect,465
attorney,465
chiropractor,465
comedian,465
composer,465
dentist,465
dietitian,465
dj,465
filmmaker,465


In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
# vectorize the text
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_sampled["profile"])

In [83]:
kf = KFold(n_splits=5)
train_index, val_index = next(kf.split(df_sampled))
train_data = df_sampled.iloc[train_index]
val_data = df_sampled.iloc[val_index]

In [84]:
# shuffle data
train_data = train_data.sample(frac=1).reset_index(drop=True)
val_data = val_data.sample(frac=1).reset_index(drop=True)

In [85]:
X_train = train_data.drop("profession", axis=1)
y_train = train_data["profession"]

In [86]:
X_train_vec = vectorizer.transform(X_train["profile"])

In [87]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression().fit(X_train_vec, y_train)

In [88]:
# get predictions
X_val = val_data.drop("profession", axis=1)
y_val = val_data["profession"]
X_val_vec = vectorizer.transform(X_val["profile"])
y_pred = model.predict(X_val_vec)

In [90]:
# get accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_val, y_pred)

0.05069124423963134

In [94]:
temp = ""
for p in X_val["profile"]:
    print(p)
    temp = p
    break

She is a graduate of the Albany Law School at the Union University, Albany, NY. Ms. Syed is the principal of the law firm and is admitted to Southern and Eastern Federal Courts, Southern and Eastern Bankruptcy Courts, and the U.S. Tax Court. She regularly handles complex legal matters, stemming from medicaid fraud, contract claims to IRS tax cases, and H1B, EB-2 and 5 cases. Ms. Syed also is a special prosecutor for the Village of Hempstead.
