In [1]:
# `display` and `HTML` are part of the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
%run "Import Libraries.ipynb"

In [5]:
class setup_data:
    """Import a spreadsheet and prepare it for modelling.

    The instance stores the location of the data (``file_path``/``file_tab``),
    the name of the label column (``data_label``), the feature column names
    (``features``), the pipeline ``stage`` and the name of the column that
    receives the encoded label (``target_selected``).
    """

    def __init__(self, file_path, file_tab, data_label, features, stage, target_selected):
        """Store the file's parameters on the instance."""
        self.file_path = file_path
        self.file_tab = file_tab
        self.data_label = data_label
        self.features = features
        self.stage = stage
        self.target_selected = target_selected

    def import_data(self):
        """Read sheet ``file_tab`` of the Excel file at ``file_path``.

        Returns:
            The DataFrame produced by ``pd.read_excel``.
        """
        return pd.read_excel(self.file_path, sheet_name=self.file_tab)

    def add_categorical(self, df):
        """Encode the label column as integer codes.

        Only acts when ``stage == 'training'``: the ``data_label`` column is
        passed through a fresh ``LabelEncoder`` and the codes are stored in the
        ``target_selected`` column of ``df`` (``df`` is modified in place).

        Returns:
            ``df``, with the extra column when in the training stage.
        """
        if self.stage == 'training':
            # Plain column lookup instead of ``df.eval`` so that column names
            # that are not valid Python identifiers also work.
            df[self.target_selected] = LabelEncoder().fit_transform(df[self.data_label])

        return df

    def alter_data(self, df):
        """Keep the feature/label columns and omit records containing nulls.

        Only acts when ``stage == 'training'``; otherwise ``df`` is returned
        unchanged.

        Returns:
            A DataFrame restricted to the feature columns plus the label
            column, without the rows that hold a null in any of them.
        """
        if self.stage != 'training':
            return df

        # Work on a copy: appending to ``self.features`` directly would grow
        # the instance attribute on every call.
        if isinstance(self.features, str):
            columns = [self.features]
        else:
            columns = list(self.features)
        if self.data_label not in columns:
            columns.append(self.data_label)

        # ``df[boolean_frame]`` only masks cells and keeps every row, so the
        # null records have to be removed with ``dropna``.
        return df[columns].dropna()

    def map_dict(self, df):
        """Build the lookup between encoded ids and label values.

        Reads the ``data_label`` and ``target_selected`` columns of ``df``.

        Returns:
            A tuple ``(id_to_category, category_id_df)`` where
            ``category_id_df`` holds the distinct (label, id) pairs sorted by
            id and ``id_to_category`` maps each id to its label.
        """
        # Use the same column that ``add_categorical`` writes to rather than a
        # hard-coded name.
        id_column = self.target_selected
        category_id_df = (
            df[[self.data_label, id_column]]
            .drop_duplicates()
            .sort_values(id_column)
        )
        id_to_category = dict(category_id_df[[id_column, self.data_label]].values)

        return id_to_category, category_id_df

    def replace_value(self, data, mapping=None):
        """Translate every entry of ``data`` through an id-to-label mapping.

        Args:
            data: Iterable of keys to translate.
            mapping: Optional lookup dict. When omitted, the notebook-level
                ``id_to_category`` variable is used, as before.

        Returns:
            A list with the mapped value for each entry of ``data``.

        Raises:
            KeyError: If an entry of ``data`` is missing from the mapping.
        """
        if mapping is None:
            # Falls back to the global defined elsewhere in the notebook.
            mapping = id_to_category

        return [mapping[key] for key in data]
    
class visualize_data:
    """Display plots for the label distribution and model results."""

    def __init__(self, data_label, plot_feature_count, stage):
        """Store the label column, the column to count and the pipeline stage."""
        self.data_label = data_label
        self.plot_feature_count = plot_feature_count
        self.stage = stage

    def target_val_freq(self, df):
        """Draw a bar chart of record counts per label value.

        Groups ``df`` by the ``data_label`` column and counts the non-null
        entries of the ``plot_feature_count`` column in each group.

        Returns:
            The object returned by pandas' ``plot.bar``.
        """
        plt.figure(figsize=(6, 6))
        # The column to count lives on the instance, not in a global.
        counts = df.groupby(self.data_label)[self.plot_feature_count].count()

        return counts.plot.bar(ylim=0)

    def confusion_plot(self, y_test, y_pred):
        """Display the confusion matrix as a heatmap and print the accuracy.

        In the ``training`` stage the axis labels are translated through
        ``attribute_df.replace_value`` (``attribute_df`` is a notebook-level
        object); in the ``validation`` stage the raw label values are used.

        Args:
            y_test: Actual labels.
            y_pred: Predicted labels, same length as ``y_test``.

        Raises:
            ValueError: If ``stage`` is neither "training" nor "validation".
        """
        stage = self.stage.lower()
        if stage not in ("training", "validation"):
            raise ValueError(
                "stage must be 'training' or 'validation', got " + repr(self.stage)
            )

        # Take the labels from both vectors: a class that only shows up in the
        # predictions still gets a row/column in the matrix, so labels derived
        # from y_test alone would not match its shape.
        labels = np.unique(
            np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_pred).ravel()])
        )
        data = confusion_matrix(y_test, y_pred, labels=labels)

        if stage == "training":
            # Map ids to names in matrix order; do not re-sort the names.
            axis_labels = attribute_df.replace_value(list(labels))
        else:
            axis_labels = labels

        df_cm = pd.DataFrame(data, columns=axis_labels, index=axis_labels)
        df_cm.index.name = 'Actual'
        df_cm.columns.name = 'Predicted'

        plt.figure(figsize=(12, 12))
        sns.set(font_scale=1.4)  # label size
        sns.heatmap(df_cm, fmt='d', annot=True, annot_kws={"size": 16})  # font size

        correct = int(np.sum(np.asarray(y_pred) == np.asarray(y_test)))
        accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
        print("\n Accuracy Score: " + str(accuracy_pct) + " % (" + str(correct) + " out of " + str(len(y_test)) + " Correct)" + "\n")

class model_data:
    """Vectorise text features and fit or compare classifiers."""

    def __init__(self, feature_selected, target_selected, data_label, test_size):
        """Store the feature/target/label column names and the test fraction."""
        self.feature_selected = feature_selected
        self.target_selected = target_selected
        self.data_label = data_label
        self.test_size = test_size

    def data_transformation(self, df):
        """Turn the feature column into a dense TF-IDF matrix.

        Returns:
            A tuple ``(labels, features, tfidf)``: the ``target_selected``
            column of ``df``, the dense TF-IDF array built from the
            ``feature_selected`` column, and the fitted ``TfidfVectorizer``.
        """
        tfidf = TfidfVectorizer(
            sublinear_tf=True,
            min_df=5,
            norm='l2',
            encoding='latin-1',
            ngram_range=(1, 2),
            stop_words='english',
        )
        # Plain column lookups instead of ``df.eval`` so that column names
        # that are not valid Python identifiers also work.
        features = tfidf.fit_transform(df[self.feature_selected]).toarray()
        labels = df[self.target_selected]

        return labels, features, tfidf

    def model_selection(self, df):
        """Cross-validate several classifiers and plot/print their accuracy.

        Each candidate is scored with 5-fold cross-validation on the TF-IDF
        features from ``data_transformation``. The per-fold accuracies are
        shown as a box/strip plot and the mean accuracy per model is printed.
        Nothing is returned.
        """
        # Reuse the shared transformation instead of duplicating it here. The
        # former CountVectorizer/MultinomialNB fit was never used, so it is
        # dropped.
        labels, features, _ = self.data_transformation(df)

        models = [
            RandomForestClassifier(n_estimators=200, max_depth=3, random_state=100),
            LinearSVC(),
            MultinomialNB(),
            LogisticRegression(random_state=100),
        ]

        CV = 5
        entries = []
        for model in models:
            model_name = model.__class__.__name__
            accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
            entries.extend(
                (model_name, fold_idx, accuracy)
                for fold_idx, accuracy in enumerate(accuracies)
            )

        # Build the frame once, after all folds have been collected.
        cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

        sns.boxplot(x='model_name', y='accuracy', data=cv_df)
        sns.stripplot(x='model_name', y='accuracy', data=cv_df, size=8, jitter=True, edgecolor="gray", linewidth=2)
        plt.show()
        print(cv_df.groupby('model_name').accuracy.mean())

    def model_selected(self, df, features, labels):
        """Fit a ``LinearSVC`` on a train split and predict the test split.

        Args:
            df: DataFrame whose index is split alongside the data.
            features: Feature matrix.
            labels: Target values aligned with ``features``.

        Returns:
            A tuple ``(y_pred, y_test, model)``: the predictions for the test
            split, the matching true labels, and the fitted model.
        """
        model = LinearSVC()
        # The index halves are not used afterwards; ``df.index`` stays in the
        # call so the split is performed exactly as before.
        X_train, X_test, y_train, y_test, _idx_train, _idx_test = train_test_split(
            features, labels, df.index, test_size=self.test_size, random_state=100
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        return y_pred, y_test, model
    