In [1]:
import sys

# Extend the import search path BEFORE the project-local imports below run.
# Inserting it after them (as before) cannot help those imports resolve.
# NOTE(review): '../..' is relative to the current working directory;
# presumably that is where the `assignments` package lives -- confirm.
sys.path.insert(0, '../..')

import pandas as pd  # noqa: E402
from sklearn.neighbors import KNeighborsClassifier  # noqa: E402
from sklearn.feature_extraction.text import TfidfVectorizer  # noqa: E402
from my_GA_KNN import my_GA as my_GA_KNN  # noqa: E402
from assignments.assignment8.my_evaluation import my_evaluation  # noqa: E402
from gensim.parsing.preprocessing import remove_stopwords  # noqa: E402


class my_model():
    """KNN classifier for the ``fraudulent`` label of a postings frame.

    Features are TF-IDF vectors of the ``description`` and ``location``
    columns plus three columns copied through unchanged.  The KNN
    hyper-parameters are chosen by ``my_GA_KNN`` using F1 as the objective.

    Input frames passed to ``fit``/``predict`` must contain the columns
    ``description``, ``location``, ``has_company_logo``, ``telecommuting``
    and ``has_questions``.  Neither method modifies the caller's frame.
    """

    # Characters stripped from every description before vectorising.
    _SPECIAL_CHARS = str.maketrans("", "", "#<>+*^=${}")
    # Columns of the input frame copied into the feature matrix unchanged.
    _FLAG_COLUMNS = ('has_company_logo', 'telecommuting', 'has_questions')

    def __init__(self):
        # Vectoriser for the free-text description (at most 1000 terms).
        self.preprocessor = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, max_features=1000)
        # Separate, much smaller vectoriser for the location string (at most 3 terms).
        self.preprocessorLocations = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, max_features=3)

    def obj_func(self, predictions, actuals, pred_proba=None):
        """Objective passed to the GA.

        Returns a one-element list holding ``my_evaluation(...).f1()``.
        """
        # Named `evaluator` rather than `eval` so the builtin is not shadowed.
        evaluator = my_evaluation(predictions, actuals, pred_proba)
        return [evaluator.f1()]

    def clean(self, X):
        """Return a copy of ``X`` with a normalised ``description`` column.

        Each description has the characters ``# < > + * ^ = $ { }`` removed,
        is lower-cased and passed through ``remove_stopwords``.  Missing
        descriptions are treated as empty strings.  As before, the cleaned
        column ends up as the last column of the returned frame; unlike
        before, the caller's frame is left untouched.
        """
        raw = X['description'].fillna("").astype(str)
        cleaned = [
            remove_stopwords(text.translate(self._SPECIAL_CHARS).lower())
            for text in raw
        ]
        result = X.drop(columns='description')
        # Assign a plain list (positional) rather than a DataFrame, which
        # would be aligned on index labels and yield NaN for any X whose
        # index is not 0..n-1.
        result['description'] = cleaned
        return result

    def locations(self, X, fit=True):
        """Return TF-IDF features of ``X['location']`` as a DataFrame.

        Commas are removed and the text lower-cased before vectorising;
        missing locations are treated as empty strings.

        Args:
            X: frame with a ``location`` column.
            fit: when True (default, the previous behaviour) the location
                vectoriser is re-fitted on ``X``; pass False to reuse the
                vocabulary learned earlier, as ``predict`` must.

        Returns:
            DataFrame with a fresh 0..n-1 index and integer column labels.
        """
        texts = [
            place.replace(",", "").lower()
            for place in X['location'].fillna("").astype(str)
        ]
        if fit:
            matrix = self.preprocessorLocations.fit_transform(texts)
        else:
            matrix = self.preprocessorLocations.transform(texts)
        return pd.DataFrame(matrix.toarray())

    def reduce(self, X, y):
        """Regroup the training rows: label-0 rows first, then label-1 rows.

        ``X`` and ``y`` are paired by position (both indexes are reset
        first), so a ``y`` carrying a shuffled or non-zero-based index no
        longer produces NaN rows.  Rows whose label is neither 0 nor 1 are
        dropped.

        NOTE: down-sampling of the label-0 rows (keeping 75% of them) used
        to live here and is currently disabled.

        Returns:
            ``(X, y)`` with a fresh 0..n-1 index; ``y`` is named
            ``'fraudulent'``.
        """
        if not isinstance(y, (pd.Series, pd.DataFrame)):
            y = pd.Series(y)
        labels = y.reset_index(drop=True)
        if isinstance(labels, pd.Series):
            # The filters below look the label up by this column name.
            labels = labels.rename('fraudulent')

        data = pd.concat([X.reset_index(drop=True), labels], axis=1)
        reals = data[data['fraudulent'] == 0]
        frauds = data[data['fraudulent'] == 1]

        data = pd.concat([reals, frauds], axis=0).reset_index(drop=True)
        y = data.pop('fraudulent')
        return data, y

    def _build_features(self, X, fit):
        """Assemble the feature matrix shared by ``fit`` and ``predict``.

        With ``fit=True`` both vectorisers are fitted on ``X``; with
        ``fit=False`` they are only applied, so the columns match the ones
        the classifier was trained on.
        """
        # Work on a re-indexed copy so every piece concatenated below shares
        # the same 0..n-1 index and the caller's frame is not modified.
        X = self.clean(X.reset_index(drop=True))

        if fit:
            text_matrix = self.preprocessor.fit_transform(X['description'])
        else:
            text_matrix = self.preprocessor.transform(X['description'])

        # Prefix the integer column labels so the final frame has unique,
        # all-string labels; newer scikit-learn releases refuse frames whose
        # column labels mix int and str.
        text_features = pd.DataFrame(text_matrix.toarray()).add_prefix('desc_')
        location_features = self.locations(X, fit=fit).add_prefix('loc_')
        flags = X[list(self._FLAG_COLUMNS)]
        return pd.concat([text_features, flags, location_features], axis=1)

    def fit(self, X, y):
        """Fit the vectorisers, tune KNN with the GA and train ``self.clf``.

        Args:
            X: training frame (see the class docstring for required columns).
            y: labels, paired with the rows of ``X`` by position.
        """
        XX = self._build_features(X, fit=True)
        XX, y = self.reduce(XX, y)

        # Search space handed to the GA, in the order the result is read
        # back below: n_neighbors (presumably a numeric range, since the
        # winner is rounded), weights, algorithm.
        ga = my_GA_KNN(KNeighborsClassifier, XX, y, [[3, 20], ("uniform", "distance"), ('auto', 'brute', "kd_tree")],
                       self.obj_func, generation_size=10,
                       crossval_fold=2,
                       max_generation=10, max_life=2)

        best = ga.tune()[0]
        self.clf = KNeighborsClassifier(n_neighbors=round(best[0]), weights=best[1], algorithm=best[2])
        self.clf.fit(XX, y)
        return

    def predict(self, X):
        """Return ``self.clf`` predictions for ``X``, one per row in order.

        Uses the vocabularies learned in ``fit``; ``fit`` must be called
        first.
        """
        XX = self._build_features(X, fit=False)
        predictions = self.clf.predict(XX)
        return predictions

ModuleNotFoundError: No module named 'my_GA_KNN'