In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import nltk
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer
from tqdm import tqdm

In [15]:
# Tunable settings for the whole notebook.
TRAIN_DATA_PATH = "Data/train.csv"  # CSV read by DataLoader
NUM_FOLDS = 5                       # number of KFold splits
FRACTION_DATA = 0.1                 # share of rows kept by DataLoader.sample_data

# The stopword corpus is read in this cell, which runs before check_deps() is
# called further down. On a machine where the corpus has never been downloaded
# the lookup raises LookupError, so fetch it on demand and retry once.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

# Shared text-normalisation helpers used by DataLoader.clean_data.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [16]:
def check_deps():
    """Download the NLTK resources this notebook relies on.

    Requests the ``stopwords``, ``wordnet`` and ``omw-1.4`` packages through
    ``nltk.download``. Every package is attempted, even when an earlier
    download reports a failure.

    Returns:
        bool: True only if every ``nltk.download`` call returned a truthy
        value; False otherwise.
    """
    packages = ("stopwords", "wordnet", "omw-1.4")
    # Materialise the list first: feeding a generator to all() would stop at
    # the first failure and skip the remaining packages.
    results = [nltk.download(package) for package in packages]
    return all(results)

In [17]:
class DataLoader:
    """Load, rebalance, clean and TF-IDF-vectorise the profile/profession data.

    Construction reads the CSV at ``train_data_path``, keeps a random
    ``FRACTION_DATA`` share of its rows, oversamples it with
    ``RandomOverSampler``, normalises the ``profile`` text and fits
    ``self.vectorizer`` on the cleaned text.

    Attributes:
        df: the sampled, oversampled and cleaned DataFrame.
        vectorizer: the ``TfidfVectorizer`` fitted on ``df["profile"]``.
        random_state: seed forwarded to the row sampling and the oversampler.

    Note:
        Oversampling and vectorizer fitting are both done on the whole frame
        in the constructor, i.e. before any train/validation split the caller
        performs on ``df``. Rows duplicated by the oversampler can therefore
        end up on both sides of such a split.
    """

    def __init__(self, train_data_path, random_state=None):
        """Read the CSV and run sampling, cleaning and vectorizer fitting.

        Args:
            train_data_path: path passed to ``pd.read_csv``; the file must
                provide the ``profile`` and ``profession`` columns.
            random_state: optional seed for ``DataFrame.sample`` and
                ``RandomOverSampler``. The default ``None`` keeps the
                previous unseeded behaviour.
        """
        self.random_state = random_state
        self.df = pd.read_csv(train_data_path)
        self.vectorizer = TfidfVectorizer()
        self.sample_data()
        self.clean_data()
        # Only the fitted vocabulary/IDF weights are needed here, so fit()
        # is used instead of fit_transform() with a discarded result.
        self.vectorizer.fit(self.df["profile"])

    def sample_data(self):
        """Subsample ``self.df`` and oversample it on ``profession``.

        Keeps a ``FRACTION_DATA`` share of the rows, then resamples with
        ``RandomOverSampler`` using ``profession`` as the target. The result
        replaces ``self.df``.
        """
        # getattr keeps this method usable on instances built without __init__.
        seed = getattr(self, "random_state", None)
        self.df = self.df.sample(frac=FRACTION_DATA, random_state=seed)
        X, y = self.df.drop("profession", axis=1), self.df["profession"]
        over = RandomOverSampler(random_state=seed)
        X_sampled, y_sampled = over.fit_resample(X, y)
        X_sampled["profession"] = y_sampled
        self.df = X_sampled

    def clean_data(self):
        """Normalise every entry of ``self.df["profile"]`` in place.

        For each profile: replace punctuation and digit runs with spaces,
        lower-case, split on whitespace, drop stopwords, stem, drop stems that
        are themselves stopwords, lemmatise and re-join with single spaces.
        Missing profiles are treated as empty text.
        """
        # Build the lookup set once; rebuilding it for every token dominated
        # the running time of this loop.
        stopword_set = set(stopwords)
        # A missing value would make re.sub raise TypeError.
        profiles = self.df["profile"].fillna("").astype(str)
        cleaned_profiles = []
        for profile in tqdm(profiles):
            profile = re.sub(r'[^\w\s]', ' ', profile)
            profile = re.sub(r'\d+', ' ', profile)
            tokens = profile.lower().split()
            stems = [stemmer.stem(token) for token in tokens
                     if token not in stopword_set]
            # Second filter: stemming can turn a token into a stopword.
            lemmas = [lemmatizer.lemmatize(stem) for stem in stems
                      if stem not in stopword_set]
            cleaned_profiles.append(' '.join(lemmas))
        self.df["profile"] = cleaned_profiles

    def vectorize_data(self, data):
        """Return ``self.vectorizer.transform(data)`` for an iterable of texts."""
        return self.vectorizer.transform(data)

    def get_X_and_y(self, data_df):
        """Split a frame into vectorised features and labels.

        Args:
            data_df: DataFrame with ``profile`` and ``profession`` columns.

        Returns:
            tuple: ``(X_vec, y)`` where ``X_vec`` is the vectorised
            ``profile`` column and ``y`` is the ``profession`` column.
        """
        y = data_df["profession"]
        X_vec = self.vectorize_data(data_df["profile"])
        return X_vec, y

In [18]:
check_deps()

data_loader = DataLoader(train_data_path=TRAIN_DATA_PATH)
df = data_loader.df

# DataLoader oversamples inside its constructor, and the oversampler
# presumably places the duplicated rows after the original ones. Contiguous
# (unshuffled) folds would then not be representative, so shuffle with a
# fixed seed to keep the split reproducible.
# NOTE(review): because oversampling happens before this split, copies of a
# validation row can still appear in the training fold; validation scores
# are likely optimistic until resampling is applied to the training fold only.
SPLIT_SEED = 42
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SPLIT_SEED)
## Currently considering only one fold here
train_index, val_index = next(kf.split(df))
train_data = df.iloc[train_index]
val_data = df.iloc[val_index]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kshitij/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kshitij/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/kshitij/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
100%|██████████| 107744/107744 [02:27<00:00, 729.98it/s] 


In [12]:

# Keep the labels under a real name instead of `_`: rebinding `_` at the top
# level of a notebook overrides IPython's "last output" variable.
X_train_vec, y_train = data_loader.get_X_and_y(train_data)


MemoryError: Unable to allocate 21.5 GiB for an array with shape (86688, 33354) and data type float64

In [19]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when the installed version provides it.
vectorizer = data_loader.vectorizer
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
# Show a preview only; dumping the whole vocabulary floods the cell output.
list(feature_names[:100])



['_blank',
 '_john_harrison',
 '_niptuck',
 '_wpnonc',
 '_x',
 'aa',
 'aaa',
 'aaaaj',
 'aaai',
 'aaal',
 'aaba',
 'aabha',
 'aac',
 'aacd',
 'aachen',
 'aacp',
 'aacr',
 'aacsb',
 'aad',
 'aag',
 'aagaard',
 'aagt',
 'aaha',
 'aahivm',
 'aai',
 'aalborg',
 'aalto',
 'aamft',
 'aamo',
 'aan',
 'aanchal',
 'aanp',
 'aao',
 'aap',
 'aarburg',
 'aarhu',
 'aaron',
 'aaronshom',
 'aarp',
 'aarrestad',
 'aaryan',
 'aashish',
 'aasld',
 'aasn',
 'aatf',
 'aatseel',
 'aatsp',
 'aau',
 'aauw',
 'ab',
 'aba',
 'ababa',
 'abacanurseri',
 'aback',
 'abad',
 'abakanowicz',
 'abandon',
 'abang',
 'abb',
 'abba',
 'abbey',
 'abbi',
 'abbott',
 'abc',
 'abcnew',
 'abd',
 'abdellah',
 'abdellaoui',
 'abdic',
 'abdomen',
 'abdomin',
 'abdominoplasti',
 'abduct',
 'abductor',
 'abdulazez',
 'abdurasulov',
 'abeer',
 'abel',
 'abella',
 'abelo',
 'abend',
 'abercrombi',
 'aberdeen',
 'aberdeenshir',
 'aberhart',
 'aberystwyth',
 'abet',
 'abhijeet',
 'abi',
 'abid',
 'abidi',
 'abigail',
 'abigailesman',
