In [109]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import nltk
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer
from tqdm import tqdm

In [110]:
def check_deps():
    """Fetch the NLTK resources this notebook relies on.

    Calls nltk.download for the 'stopwords', 'wordnet' and 'omw-1.4'
    packages, in that order.

    Returns:
        True, unconditionally, once the three download calls have returned.
    """
    for package in ('stopwords', "wordnet", "omw-1.4"):
        nltk.download(package)
    return True
check_deps()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kshitijalwadhi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kshitijalwadhi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kshitijalwadhi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [111]:
# Path of the CSV that the data-loading cell reads into `df`.
TRAIN_DATA_PATH = "Data/train.csv"
# Number of splits handed to KFold.
NUM_FOLDS = 5
# Fraction of the loaded rows kept by df.sample (0.1 keeps a 10% subsample).
FRACTION_DATA = 0.1

# English stop-word list from the NLTK corpus, plus the lemmatizer and stemmer
# instances used by clean_data.
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [112]:
# Read the training CSV and keep a seeded random subsample of its rows.
df = pd.read_csv(TRAIN_DATA_PATH).sample(frac=FRACTION_DATA, random_state=42)
df.head()

Unnamed: 0,profile,profession
30016,He obtained his medical school training at the...,physician
14622,In 2007 he naturalised as a British citizen ke...,composer
97157,While in Austin he worked his way up through m...,comedian
114197,Her primary research interest is in financial ...,professor
16319,"She is a graduate of NYU’s Science, Health and...",journalist


In [113]:
# Distinct profession labels present in the subsample, in order of first appearance
# (numpy object array returned by Series.unique).
professions = df["profession"].unique()
professions

array(['physician', 'composer', 'comedian', 'professor', 'journalist',
       'paralegal', 'nurse', 'attorney', 'teacher', 'psychologist',
       'dietitian', 'photographer', 'surgeon', 'poet', 'rapper',
       'software_engineer', 'architect', 'dentist', 'model', 'pastor',
       'filmmaker', 'painter', 'accountant', 'dj', 'yoga_teacher',
       'interior_designer', 'chiropractor', 'personal_trainer'],
      dtype=object)

In [114]:
# Number of rows per profession label, most frequent first
df["profession"].value_counts()

professor            3822
physician            1240
attorney             1104
photographer          829
nurse                 620
journalist            619
psychologist          598
teacher               509
dentist               469
surgeon               445
architect             322
painter               258
filmmaker             220
model                 219
poet                  219
software_engineer     216
accountant            177
composer              170
dietitian             136
comedian               95
pastor                 87
chiropractor           82
paralegal              63
rapper                 62
personal_trainer       58
dj                     52
yoga_teacher           48
interior_designer      47
Name: profession, dtype: int64

In [115]:
# # make every non-professor in profession
# df["profession"] = df["profession"].apply(lambda x: "non-professor" if x!="professor" else x)

In [116]:
# get classwise count of professions
df["profession"].value_counts()

professor            3822
physician            1240
attorney             1104
photographer          829
nurse                 620
journalist            619
psychologist          598
teacher               509
dentist               469
surgeon               445
architect             322
painter               258
filmmaker             220
model                 219
poet                  219
software_engineer     216
accountant            177
composer              170
dietitian             136
comedian               95
pastor                 87
chiropractor           82
paralegal              63
rapper                 62
personal_trainer       58
dj                     52
yoga_teacher           48
interior_designer      47
Name: profession, dtype: int64

In [None]:
# NOTE: this cell contained only a bare `def` (no name or body), which is a SyntaxError
# and stops a Restart & Run All; the unfinished definition was replaced by this comment.

In [117]:
def clean_data(df):
    """Add a normalised "cleaned_profile" text column to df.

    Each entry of df["profile"] is processed as follows:
      1. every character that is neither a word character nor whitespace is
         replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. stop words are dropped and the remaining tokens are stemmed;
      4. stems that are themselves stop words are dropped and the rest are
         lemmatized;
      5. the resulting tokens are joined with single spaces.

    Args:
        df: DataFrame with a "profile" column of strings.

    Returns:
        The same DataFrame object, modified in place with the added
        "cleaned_profile" column.
    """
    # Build the lookup set once instead of once per token.
    stop_set = set(stopwords)
    cleaned_profiles = []
    for profile in tqdm(df["profile"]):
        tokens = re.sub(r'[^\w\s]', ' ', profile).lower().split()
        stems = [stemmer.stem(tok) for tok in tokens if tok not in stop_set]
        # Second stop-word pass is kept on purpose: a stem can collapse onto a stop word.
        lemmas = [lemmatizer.lemmatize(stem) for stem in stems if stem not in stop_set]
        cleaned_profiles.append(' '.join(lemmas))
    df["cleaned_profile"] = cleaned_profiles
    return df

In [118]:
professions_sorted = sorted(professions)

In [119]:
def add_profession_one_hot(df):
    """Add one "presence_<profession>" column per known profession.

    For every row, the raw "profile" text is split on whitespace and each
    lower-cased token is compared with the entries of professions_sorted.
    The value stored for a profession is 1 plus the number of matching
    tokens, i.e. the counters start at 1 exactly as in the earlier version
    of this function. Tokens are not stripped of punctuation, so
    "professor," does not match "professor".

    Args:
        df: DataFrame with a "profile" column of strings.

    Returns:
        The same DataFrame object, modified in place. It gains the constant
        "presence" column (all ones) and one integer column per profession.
    """
    df["presence"] = 1
    # profession -> column position; replaces the per-token list.index() scan.
    column_of = {name: pos for pos, name in enumerate(professions_sorted)}
    counts = np.ones((len(df), len(professions_sorted)), dtype=int)
    for row_pos, profile in enumerate(tqdm(df["profile"])):
        for w in profile.split():
            col = column_of.get(w.lower())
            if col is not None:
                counts[row_pos, col] += 1
    # Assign positionally as whole columns; writing a profession-length list into
    # a single row (as before) does not fit the frame's column layout.
    for name, col in column_of.items():
        df["presence_" + name] = counts[:, col]
    return df

In [120]:
df = clean_data(df)

100%|██████████| 12786/12786 [00:09<00:00, 1407.50it/s]


In [121]:
# df = add_profession_one_hot(df)

In [122]:
df

Unnamed: 0,profile,profession,cleaned_profile
30016,He obtained his medical school training at the...,physician,obtain medic school train univers texa medic b...
14622,In 2007 he naturalised as a British citizen ke...,composer,2007 naturalis british citizen keep spanish ci...
97157,While in Austin he worked his way up through m...,comedian,austin work way major venu showcas produc mult...
114197,Her primary research interest is in financial ...,professor,primari research interest financi account part...
16319,"She is a graduate of NYU’s Science, Health and...",journalist,graduat nyu scienc health environment report p...
...,...,...,...
66210,He holds an MA in English literature from Sout...,professor,hold english literatur south dakota state univ...
123466,"As a musician herself, Renuka realizes the imp...",attorney,musician renuka realiz import intellectu prope...
60158,She graduated with honors from University Of I...,nurse,graduat honor univers illinoi chicago health s...
91322,He specializes in creating imaginative and tho...,photographer,special creat imagin thought imag sell product...


In [123]:
# TF-IDF over unigrams and bigrams, capped at 10000 features, with sublinear
# term-frequency scaling enabled.
vectorizer = TfidfVectorizer(ngram_range = (1,2), max_features=10000, sublinear_tf=True)
# The fit learns vocabulary and IDF weights from every row of df; the returned matrix is
# only displayed here, later cells call vectorizer.transform themselves.
# NOTE(review): df is split into train/validation only after this fit, so validation text
# contributes to the fitted vocabulary and IDF — consider fitting on the training fold only.
vectorizer.fit_transform(df["cleaned_profile"])

<12786x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 469439 stored elements in Compressed Sparse Row format>

In [124]:
# Number of features learned by the vectorizer. vocabulary_ maps each term to its
# column index, so its length is the feature count; this avoids get_feature_names(),
# which is deprecated and absent from newer scikit-learn releases.
len(vectorizer.vocabulary_)



10000

In [125]:
def sample_data(df, random_state=42):
    """Random-over-sample df on its "profession" label.

    Args:
        df: DataFrame containing a "profession" column; all other columns
            are carried along as features.
        random_state: Seed forwarded to RandomOverSampler so that repeated
            runs draw the same rows. Pass None for an unseeded sampler.

    Returns:
        A new DataFrame holding the resampled feature columns with the
        matching "profession" labels appended as a column.
    """
    X, y = df.drop("profession", axis=1), df["profession"]
    over = RandomOverSampler(random_state=random_state)
    X_sampled, y_sampled = over.fit_resample(X, y)
    X_sampled["profession"] = y_sampled
    return X_sampled

In [126]:
def get_X_and_y(data_df):
    """Return (features, labels) for data_df.

    Labels are the "profession" column; features are the "cleaned_profile"
    column passed through the module-level, already fitted `vectorizer`.
    """
    labels = data_df["profession"]
    features = vectorizer.transform(data_df["cleaned_profile"])
    return features, labels
X_vec, y = get_X_and_y(df)

In [127]:
# K-fold splitter over the rows of df, without shuffling.
kf = KFold(n_splits=NUM_FOLDS, shuffle=False)
# Only the first split yielded by the generator is used, i.e. a single
# train/validation partition rather than a full cross-validation loop.
train_index, val_index = next(kf.split(df))
# The split yields positional indices, hence iloc.
train_data = df.iloc[train_index]
val_data = df.iloc[val_index]

In [130]:
# def combine_high_occ_professions(df):
#     professor_df = df[df["profession"]=="professor"]
#     # empty df
#     profile_arr = professor_df["profile"].values
#     cleaned_profile_arr = professor_df["cleaned_profile"].values
#     profession_arr = professor_df["profession"].values

#     profile_arr = np.array([profile_arr[i:i+3] for i in range(0, len(profile_arr), 3)])
#     cleaned_profile_arr = np.array([cleaned_profile_arr[i:i+3] for i in range(0, len(cleaned_profile_arr), 3)])
#     profession_arr = np.array([profession_arr[i:i+3] for i in range(0, len(profession_arr), 3)])


In [131]:
# get value counts of professions
df["profession"].value_counts()

professor            3822
physician            1240
attorney             1104
photographer          829
nurse                 620
journalist            619
psychologist          598
teacher               509
dentist               469
surgeon               445
architect             322
painter               258
filmmaker             220
model                 219
poet                  219
software_engineer     216
accountant            177
composer              170
dietitian             136
comedian               95
pastor                 87
chiropractor           82
paralegal              63
rapper                 62
personal_trainer       58
dj                     52
yoga_teacher           48
interior_designer      47
Name: profession, dtype: int64

In [None]:
def augment_data(df, profession, target_num):
    """Unfinished stub: so far it only selects the raw profiles of one profession.

    TODO: nothing is returned and target_num is never used.
    """
    profiles = df[df["profession"]==profession]["profile"].values
    # split (step not implemented yet)
    

In [None]:
train_data = sample_data(train_data)

In [107]:
# Shuffle both partitions with a fixed seed and replace their index with 0..n-1
train_data = train_data.sample(frac=1, random_state = 42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state = 42).reset_index(drop=True)

In [35]:
X_train_vec, y_train = get_X_and_y(train_data)

In [44]:
model = MultinomialNB(alpha=0.7).fit(X_train_vec, y_train)

In [45]:
X_val_vec, y_val = get_X_and_y(val_data)

y_pred = model.predict(X_val_vec)

print(metrics.accuracy_score(y_val, y_pred))

0.5977326035965598


In [280]:
# Micro- and macro-averaged F1 on the validation partition.
# NOTE(review): for single-label multiclass predictions micro F1 equals accuracy, yet the
# value saved under this cell (0.713) differs from the accuracy printed by the previous
# cell (0.598) — the saved outputs appear to come from different runs; re-run top to bottom.
print("Micro F1: ", metrics.f1_score(y_val, y_pred, average='micro'))
print("Macro F1: ", metrics.f1_score(y_val, y_pred, average='macro'))

Micro F1:  0.7130570758405004
Macro F1:  0.6309011590186797


In [281]:
# Mean of the micro- and macro-averaged F1 scores on the validation partition
micro_f1 = metrics.f1_score(y_val, y_pred, average='micro')
macro_f1 = metrics.f1_score(y_val, y_pred, average='macro')
print("Average F1: ", (micro_f1 + macro_f1)/2)

Average F1:  0.67197911742959


In [285]:
# per-class precision, recall, F1 and support (not accuracy)
print(metrics.classification_report(y_val, y_pred))

                   precision    recall  f1-score   support

       accountant       0.57      0.77      0.66        35
        architect       0.58      0.60      0.59        70
         attorney       0.87      0.82      0.84       253
     chiropractor       0.55      0.79      0.65        14
         comedian       0.48      0.61      0.54        18
         composer       0.63      0.76      0.69        34
          dentist       0.90      0.81      0.85        89
        dietitian       0.56      0.80      0.66        25
               dj       0.75      0.38      0.50         8
        filmmaker       0.61      0.73      0.67        48
interior_designer       0.75      0.60      0.67        10
       journalist       0.56      0.63      0.59       118
            model       0.94      0.75      0.83        40
            nurse       0.69      0.72      0.70       119
          painter       0.52      0.80      0.63        41
        paralegal       0.60      0.23      0.33       

In [293]:
# for every true label in `professions`, print how the predictions for those
# validation rows are distributed over the predicted labels
for occ in professions:
    print("#####################")
    print("Actual label: ", occ)
    print("#####################")
    # boolean mask over the validation labels selects the matching predictions
    selected_pred = y_pred[y_val==occ]
    print(pd.Series(selected_pred).value_counts())
    print()

#####################
Actual label:  physician
#####################
physician           170
professor            18
surgeon              17
nurse                12
psychologist          6
dietitian             6
journalist            4
dentist               4
chiropractor          3
personal_trainer      2
filmmaker             1
attorney              1
photographer          1
rapper                1
pastor                1
dtype: int64

#####################
Actual label:  composer
#####################
composer     26
filmmaker     3
rapper        1
teacher       1
painter       1
pastor        1
professor     1
dtype: int64

#####################
Actual label:  comedian
#####################
comedian        11
photographer     2
psychologist     1
dietitian        1
composer         1
model            1
poet             1
dtype: int64

#####################
Actual label:  professor
#####################
professor            580
surgeon               23
software_engineer     22
psyc

In [284]:
# Score the current model on the training fold exactly as it was split from df
train_data_orig = df.iloc[train_index]
X_train_vec_orig, y_train_orig = get_X_and_y(train_data_orig)
y_pred_train_orig = model.predict(X_train_vec_orig)
micro_f1_train = metrics.f1_score(y_train_orig, y_pred_train_orig, average='micro')
macro_f1_train = metrics.f1_score(y_train_orig, y_pred_train_orig, average='macro')
print("Average F1: ", (micro_f1_train + macro_f1_train)/2)

Average F1:  0.8708453703849006


In [286]:
print(metrics.classification_report(y_train_orig, y_pred_train_orig))

                   precision    recall  f1-score   support

       accountant       0.73      0.99      0.84       142
        architect       0.83      0.94      0.88       252
         attorney       0.93      0.92      0.92       851
     chiropractor       0.66      1.00      0.80        68
         comedian       0.88      1.00      0.93        77
         composer       0.84      1.00      0.91       136
          dentist       0.93      0.88      0.91       380
        dietitian       0.80      0.97      0.88       111
               dj       0.94      1.00      0.97        44
        filmmaker       0.85      1.00      0.92       172
interior_designer       0.93      1.00      0.96        37
       journalist       0.79      0.93      0.85       501
            model       0.98      0.98      0.98       179
            nurse       0.82      0.81      0.82       501
          painter       0.79      0.99      0.88       217
        paralegal       0.88      1.00      0.93       

In [17]:
from sklearn.svm import LinearSVC

In [18]:
model = LinearSVC(C=0.1, penalty="l2", dual=False, max_iter=10000).fit(X_train_vec, y_train)

In [19]:
X_val_vec, y_val = get_X_and_y(val_data)

y_pred = model.predict(X_val_vec)

print(metrics.accuracy_score(y_val, y_pred))

0.7693510555121188


In [21]:
# calculate micro F1 and macro F1
print("Micro F1: ", metrics.f1_score(y_val, y_pred, average='micro'))
print("Macro F1: ", metrics.f1_score(y_val, y_pred, average='macro'))

Micro F1:  0.7693510555121188
Macro F1:  0.6846925785709239


In [20]:
# Mean of the micro- and macro-averaged F1 scores on the validation partition
micro_f1 = metrics.f1_score(y_val, y_pred, average='micro')
macro_f1 = metrics.f1_score(y_val, y_pred, average='macro')
print("Average F1: ", (micro_f1 + macro_f1)/2)

Average F1:  0.7270218170415214


In [22]:
# for every true label in `professions`, print how the predictions for those
# validation rows are distributed over the predicted labels
for occ in professions:
    print("#####################")
    print("Actual label: ", occ)
    print("#####################")
    # boolean mask over the validation labels selects the matching predictions
    selected_pred = y_pred[y_val==occ]
    print(pd.Series(selected_pred).value_counts())
    print()

#####################
Actual label:  physician
#####################
physician           191
surgeon              16
nurse                 9
professor             7
attorney              4
psychologist          4
chiropractor          4
dietitian             4
journalist            2
dentist               2
personal_trainer      1
architect             1
pastor                1
filmmaker             1
dtype: int64

#####################
Actual label:  composer
#####################
composer        27
psychologist     2
poet             2
rapper           1
journalist       1
dj               1
dtype: int64

#####################
Actual label:  comedian
#####################
comedian        11
teacher          2
psychologist     1
dietitian        1
model            1
poet             1
journalist       1
dtype: int64

#####################
Actual label:  professor
#####################
professor            595
psychologist          27
teacher               18
nurse                 17
p

In [23]:
# per-class precision, recall, F1 and support (not accuracy)
print(metrics.classification_report(y_val, y_pred))

                   precision    recall  f1-score   support

       accountant       0.66      0.77      0.71        35
        architect       0.65      0.70      0.68        70
         attorney       0.83      0.87      0.85       253
     chiropractor       0.52      0.79      0.63        14
         comedian       0.85      0.61      0.71        18
         composer       0.66      0.79      0.72        34
          dentist       0.93      0.98      0.95        89
        dietitian       0.60      0.72      0.65        25
               dj       0.33      0.12      0.18         8
        filmmaker       0.72      0.81      0.76        48
interior_designer       0.67      0.60      0.63        10
       journalist       0.63      0.68      0.65       118
            model       0.76      0.85      0.80        40
            nurse       0.73      0.82      0.77       119
          painter       0.64      0.88      0.74        41
        paralegal       0.86      0.46      0.60       

In [26]:
modelNB = MultinomialNB(alpha=0.7).fit(X_train_vec, y_train)

In [27]:
X_val_vec, y_val = get_X_and_y(val_data)
# NOTE(review): y_pred is reused here for the predict_proba output (one row of
# per-class probabilities per validation sample), not predicted labels as in earlier cells.
y_pred = modelNB.predict_proba(X_val_vec)

In [34]:
y_pred[34]

array([1.61451516e-05, 1.14555535e-04, 1.41914486e-04, 6.58579909e-07,
       2.76868991e-06, 2.30536118e-05, 4.18452365e-07, 4.37936328e-06,
       9.80430998e-07, 2.36576495e-04, 1.05990511e-06, 3.40247891e-03,
       4.54134197e-06, 2.12569229e-06, 8.00073639e-06, 2.63415476e-06,
       9.77170195e-07, 6.48872660e-07, 8.83181342e-05, 4.20040483e-06,
       2.78143100e-03, 9.92608402e-01, 1.58312434e-04, 1.24330630e-06,
       1.37907712e-04, 1.31935576e-06, 2.54775713e-04, 1.72086514e-07])