# Choosing min_df

In this notebook we systematically choose a value for `min_df`, the minimum document frequency a term needs in order to be kept as a feature by `CountVectorizer`.

## Imports

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import random
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, log_loss
from sklearn import svm #support vector machines
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

from functions import *

## Features plot

Below we plot how the number of features changes as we exclude ingredients that appear in fewer than $i$ recipes.

In [None]:
# Build the list of candidate min_df values.
# There are 39774 recipes in total, so min_df = i/39774 means
# "keep every ingredient that appears in at least i recipes" (i = 1..100).
df_list = [i/39774 for i in range(1,101)] 
feat_num = []

data = pd.read_json('train.json') 
recipie_list_list = data.ingredients.values.tolist()
# CountVectorizer expects one string per document, so each recipe's
# ingredient list is joined into a single space-separated string.
recipie_string_list = [" ".join(ing) for ing in recipie_list_list]
del data, recipie_list_list  # free memory; only the joined strings are needed

for min_df in df_list:
    vectorizer = CountVectorizer(min_df=min_df)
    # Only the fitted vocabulary is needed here, not the document-term matrix.
    vectorizer.fit(recipie_string_list)
    # vocabulary_ holds exactly the terms that survived the min_df cut.
    # (get_feature_names() was removed in scikit-learn 1.2, so it is avoided.)
    feat_num.append(len(vectorizer.vocabulary_))

del recipie_string_list

# x-axis: minimum number of recipes a term must appear in (i = 1..100).
plt.plot(range(1,101), feat_num, "-")
plt.xlabel('recipies')
plt.ylabel('features/ingredients')
plt.show()


## Picking min_df using cross validation

Now we plot how the accuracy of three classification methods changes as a function of how many recipes an ingredient must appear in to be considered a feature.

In [None]:
# Score the three classifiers for every candidate min_df in df_list.
# accuracy_with_min_df comes from the local `functions` module; presumably it
# returns one CV score per min_df value for each classifier, tuned over the
# parameter grids given below -- TODO confirm against functions.py.
svm_list, log_list, forrest_list = accuracy_with_min_df(
    min_df_list=df_list,
    svm_parms=[0.001, 0.1, 1, 10],
    log_parms=[0.001, 0.1, 1, 10],
    forrest_trees=[50, 100, 150],
    forrest_depth=[10, 20, None],
    folds=3,
)

In [None]:
# Plot the cross-validation score of each classifier against the minimum
# number of recipes an ingredient must appear in (1..100, matching df_list).
plt.plot(range(1,101), svm_list, '--', label='SVM')
plt.plot(range(1,101), log_list, '-', label='Logistic')
plt.plot(range(1,101), forrest_list, ':', label='Random forrest')
plt.xlabel('recipies')
plt.ylabel('Accuracy/CV score')
# Without a legend the label= arguments above are never shown and the three
# curves cannot be told apart.
plt.legend()
plt.show()