In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import nltk
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer
from tqdm import tqdm

In [29]:
def check_deps():
    nltk.download('stopwords')
    nltk.download("wordnet")
    nltk.download("omw-1.4")
    return True
check_deps()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kshitijalwadhi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kshitijalwadhi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/kshitijalwadhi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [30]:
TRAIN_DATA_PATH = "Data/train.csv"
NUM_FOLDS = 5
FRACTION_DATA = 0.1

stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [31]:
df = pd.read_csv(TRAIN_DATA_PATH)
df = df.sample(frac=FRACTION_DATA, random_state = 42)
df.head()

Unnamed: 0,profile,profession
30016,He obtained his medical school training at the...,physician
14622,In 2007 he naturalised as a British citizen ke...,composer
97157,While in Austin he worked his way up through m...,comedian
114197,Her primary research interest is in financial ...,professor
16319,"She is a graduate of NYU’s Science, Health and...",journalist


In [32]:
# get all professions
professions = df["profession"].unique()
professions

array(['physician', 'composer', 'comedian', 'professor', 'journalist',
       'paralegal', 'nurse', 'attorney', 'teacher', 'psychologist',
       'dietitian', 'photographer', 'surgeon', 'poet', 'rapper',
       'software_engineer', 'architect', 'dentist', 'model', 'pastor',
       'filmmaker', 'painter', 'accountant', 'dj', 'yoga_teacher',
       'interior_designer', 'chiropractor', 'personal_trainer'],
      dtype=object)

In [33]:
def augment_data(df, profession, target_num):
    profiles = df[df["profession"]==profession]["profile"].values
    all_sentences = []
    for profile in profiles:
        sentences = profile.split(".")
        sentences = [sentence for sentence in sentences if len(sentence)>10]
        all_sentences.extend(sentences)
    new_profiles = []
    np.random.shuffle(all_sentences)
    for i in range(target_num - len(profiles)):
        new_profile = ".".join(np.random.choice(all_sentences, 2))
        new_profiles.append(new_profile)
    new_df = pd.DataFrame({"profile": new_profiles, "profession": profession})
    combined_df = pd.concat([df, new_df])
    return combined_df

In [34]:
profiles = df[df["profession"]==professions[0]]["profile"].values
sum = 0
for profile in profiles:
    sentences = profile.split(".")
    sentences = [sentence for sentence in sentences if len(sentence)>10]
    print(sentences)
    print("######")
    sum += (len(sentences))
print(sum/len(profiles))

['He obtained his medical school training at the University of Texas Medical Branch School of Medicine and performed his residency at a hospital affiliated with the University of Texas Southwestern Medical Center at Dallas', ' Brotherman has indicated that his clinical interests include comprehensive ophthalmology and cataracts', ' The average patient rating for Dr', ' Brotherman is 4', '0 stars out of 5', ' Brotherman honors Aetna EPO, Blue Cross/Blue Shield, and Blue Cross Blue Shield Bronze, as well as other insurance carriers']
######
[' Eastman is a graduate of Uniformed Services University of the Health Sciences, F', ' Edward Hébert School of Medicine', ' He is an in-network provider for United Healthcare Compass, Blue Cross/Blue Shield, and Blue Choice, as well as other insurance carriers']
######
[' Remington practices medicine in Salt Lake City, UT and specializes in Plastic Surgery', ' Remington is affiliated with Intermountain Medical Center', ' Remington speaks English, Ger

In [35]:
def clean_data(df):
    profiles = df["profile"]
    cleaned_profiles = []
    for profile in tqdm(profiles):
        profile = re.sub(r'[^\w\s]', ' ', profile)
        profile = profile.lower()
        profile = profile.split()
        profile = [stemmer.stem(word) for word in profile if not word in set(stopwords)]
        profile = [lemmatizer.lemmatize(word) for word in profile if not word in set(stopwords)]
        profile = ' '.join(profile)
        cleaned_profiles.append(profile)
    df["cleaned_profile"] = cleaned_profiles
    return df

In [36]:
professions_sorted = sorted(professions)

In [37]:
def add_profession_one_hot(df):
    # add column for each profession
    df["presence"] = 1
    for i, row in tqdm(df.iterrows()):
        presence_list = [1] * len(professions_sorted)    
        profile = row["profile"]
        for w in profile.split():
            w = w.lower()
            if w in professions:
                presence_list[professions_sorted.index(w)] += 1
        df.loc[i, ] = presence_list 
    return df

In [38]:
kf = KFold(n_splits=NUM_FOLDS, shuffle=False)
train_index, val_index = next(kf.split(df))
train_data = df.iloc[train_index]
val_data = df.iloc[val_index]

In [40]:
AUGMENT_DATA = True
if AUGMENT_DATA:
    max_num = int(max(train_data["profession"].value_counts())/5)
    for profession in tqdm(professions):
        train_data = augment_data(train_data, profession, max_num)

100%|██████████| 28/28 [00:01<00:00, 25.46it/s]


In [41]:
def sample_data(df):
    X, y = df.drop("profession", axis=1), df["profession"]
    over = RandomOverSampler()
    X_sampled, y_sampled = over.fit_resample(X, y)
    X_sampled["profession"] = y_sampled
    return X_sampled

In [42]:
train_data = sample_data(train_data)

In [43]:
train_data = clean_data(train_data)
val_data = clean_data(val_data)

100%|██████████| 84924/84924 [00:41<00:00, 2029.95it/s]
100%|██████████| 2558/2558 [00:02<00:00, 1193.53it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["cleaned_profile"] = cleaned_profiles


In [44]:
vectorizer = TfidfVectorizer(ngram_range = (1,2), max_features=10000, sublinear_tf=True)
vectorizer.fit_transform(train_data["cleaned_profile"])

<84924x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 2422321 stored elements in Compressed Sparse Row format>

In [45]:
def get_X_and_y(data_df):
    y = data_df["profession"]
    X_vec = vectorizer.transform(data_df["cleaned_profile"])
    return X_vec, y

In [46]:
# def combine_high_occ_professions(df):
#     professor_df = df[df["profession"]=="professor"]
#     # empty df
#     profile_arr = professor_df["profile"].values
#     cleaned_profile_arr = professor_df["cleaned_profile"].values
#     profession_arr = professor_df["profession"].values

#     profile_arr = np.array([profile_arr[i:i+3] for i in range(0, len(profile_arr), 3)])
#     cleaned_profile_arr = np.array([cleaned_profile_arr[i:i+3] for i in range(0, len(cleaned_profile_arr), 3)])
#     profession_arr = np.array([profession_arr[i:i+3] for i in range(0, len(profession_arr), 3)])


In [47]:
# shuffle data
train_data = train_data.sample(frac=1, random_state = 42).reset_index(drop=True)
val_data = val_data.sample(frac=1, random_state = 42).reset_index(drop=True)

In [48]:
X_train_vec, y_train = get_X_and_y(train_data)

In [49]:
model = MultinomialNB(alpha=0.7).fit(X_train_vec, y_train)

In [50]:
X_val_vec, y_val = get_X_and_y(val_data)
y_pred = model.predict(X_val_vec)

In [51]:
# calculate micro F1 and macro F1
print("Micro F1: ", metrics.f1_score(y_val, y_pred, average='micro'))
print("Macro F1: ", metrics.f1_score(y_val, y_pred, average='macro'))

Micro F1:  0.7200938232994527
Macro F1:  0.6040794732079448


In [52]:
# get average
print("Average F1: ", (metrics.f1_score(y_val, y_pred, average='micro') + metrics.f1_score(y_val, y_pred, average='macro'))/2)

Average F1:  0.6620866482536987


In [53]:
# get classwise accuracy
print(metrics.classification_report(y_val, y_pred))

                   precision    recall  f1-score   support

       accountant       0.57      0.69      0.62        35
        architect       0.55      0.50      0.52        70
         attorney       0.85      0.84      0.85       253
     chiropractor       0.80      0.57      0.67        14
         comedian       0.61      0.61      0.61        18
         composer       0.60      0.74      0.66        34
          dentist       0.88      0.79      0.83        89
        dietitian       0.59      0.68      0.63        25
               dj       0.67      0.25      0.36         8
        filmmaker       0.69      0.65      0.67        48
interior_designer       1.00      0.30      0.46        10
       journalist       0.57      0.64      0.60       118
            model       0.91      0.72      0.81        40
            nurse       0.71      0.71      0.71       119
          painter       0.48      0.68      0.57        41
        paralegal       1.00      0.08      0.14       

In [54]:
# get prediction distribution when original label is professor
for occ in professions:
    print("#####################")
    print("Actual label: ", occ)
    print("#####################")
    selected_pred = y_pred[y_val==occ]
    print(pd.Series(selected_pred).value_counts())
    print()

#####################
Actual label:  physician
#####################
physician       178
professor        21
nurse            12
surgeon          10
psychologist      6
dentist           5
photographer      3
journalist        3
dietitian         3
teacher           2
poet              2
attorney          1
pastor            1
dtype: int64

#####################
Actual label:  composer
#####################
composer        25
photographer     2
filmmaker        2
rapper           1
poet             1
journalist       1
pastor           1
professor        1
dtype: int64

#####################
Actual label:  comedian
#####################
comedian        11
composer         2
architect        1
psychologist     1
dietitian        1
model            1
poet             1
dtype: int64

#####################
Actual label:  professor
#####################
professor            623
psychologist          21
surgeon               20
architect             14
poet                  14
attorney      

## On Train Data

In [None]:
train_data_orig = df.iloc[train_index]
X_train_vec_orig, y_train_orig = get_X_and_y(train_data_orig)
y_pred_train_orig = model.predict(X_train_vec_orig)
print("Average F1: ", (metrics.f1_score(y_train_orig, y_pred_train_orig, average='micro') + metrics.f1_score(y_train_orig, y_pred_train_orig, average='macro'))/2)

KeyError: 'cleaned_profile'

In [None]:
print(metrics.classification_report(y_train_orig, y_pred_train_orig))

                   precision    recall  f1-score   support

       accountant       0.73      0.99      0.84       142
        architect       0.83      0.94      0.88       252
         attorney       0.93      0.92      0.92       851
     chiropractor       0.66      1.00      0.80        68
         comedian       0.88      1.00      0.93        77
         composer       0.84      1.00      0.91       136
          dentist       0.93      0.88      0.91       380
        dietitian       0.80      0.97      0.88       111
               dj       0.94      1.00      0.97        44
        filmmaker       0.85      1.00      0.92       172
interior_designer       0.93      1.00      0.96        37
       journalist       0.79      0.93      0.85       501
            model       0.98      0.98      0.98       179
            nurse       0.82      0.81      0.82       501
          painter       0.79      0.99      0.88       217
        paralegal       0.88      1.00      0.93       

## Using SVM

In [None]:
from sklearn.svm import LinearSVC

In [None]:
model = LinearSVC(C=0.1, penalty="l2", dual=False, max_iter=10000).fit(X_train_vec, y_train)

In [None]:
X_val_vec, y_val = get_X_and_y(val_data)

y_pred = model.predict(X_val_vec)

print(metrics.accuracy_score(y_val, y_pred))

0.7693510555121188


In [None]:
# calculate micro F1 and macro F1
print("Micro F1: ", metrics.f1_score(y_val, y_pred, average='micro'))
print("Macro F1: ", metrics.f1_score(y_val, y_pred, average='macro'))

Micro F1:  0.7693510555121188
Macro F1:  0.6846925785709239


In [None]:
# get average
print("Average F1: ", (metrics.f1_score(y_val, y_pred, average='micro') + metrics.f1_score(y_val, y_pred, average='macro'))/2)

Average F1:  0.7270218170415214


In [None]:
# get prediction distribution when original label is professor
for occ in professions:
    print("#####################")
    print("Actual label: ", occ)
    print("#####################")
    selected_pred = y_pred[y_val==occ]
    print(pd.Series(selected_pred).value_counts())
    print()

#####################
Actual label:  physician
#####################
physician           191
surgeon              16
nurse                 9
professor             7
attorney              4
psychologist          4
chiropractor          4
dietitian             4
journalist            2
dentist               2
personal_trainer      1
architect             1
pastor                1
filmmaker             1
dtype: int64

#####################
Actual label:  composer
#####################
composer        27
psychologist     2
poet             2
rapper           1
journalist       1
dj               1
dtype: int64

#####################
Actual label:  comedian
#####################
comedian        11
teacher          2
psychologist     1
dietitian        1
model            1
poet             1
journalist       1
dtype: int64

#####################
Actual label:  professor
#####################
professor            595
psychologist          27
teacher               18
nurse                 17
p

In [None]:
# classwise accuracy
print(metrics.classification_report(y_val, y_pred))

                   precision    recall  f1-score   support

       accountant       0.66      0.77      0.71        35
        architect       0.65      0.70      0.68        70
         attorney       0.83      0.87      0.85       253
     chiropractor       0.52      0.79      0.63        14
         comedian       0.85      0.61      0.71        18
         composer       0.66      0.79      0.72        34
          dentist       0.93      0.98      0.95        89
        dietitian       0.60      0.72      0.65        25
               dj       0.33      0.12      0.18         8
        filmmaker       0.72      0.81      0.76        48
interior_designer       0.67      0.60      0.63        10
       journalist       0.63      0.68      0.65       118
            model       0.76      0.85      0.80        40
            nurse       0.73      0.82      0.77       119
          painter       0.64      0.88      0.74        41
        paralegal       0.86      0.46      0.60       