In [3]:
import pickle
import scipy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

In [4]:
# code used in practical 9
# for chi-squared selection

from sklearn.feature_selection import SelectKBest, chi2

def chisquared(X_train, X_test, y_train, k):
    """Keep the ``k`` features with the highest chi-squared score.

    The selector is fitted on the training data only and then applied
    unchanged to the test data, so both returned matrices share the same
    selected columns.

    Parameters
    ----------
    X_train : feature matrix used to fit the selector.
    X_test : feature matrix reduced with the already-fitted selector.
    y_train : class labels aligned with the rows of ``X_train``.
    k : number of top-scoring features to keep.

    Returns
    -------
    list
        ``[X_train_x2, X_test_x2]`` - the reduced train and test matrices.
    """
    # ``k`` has to be passed by keyword: SelectKBest accepts only
    # ``score_func`` positionally in current scikit-learn releases, so the
    # old ``SelectKBest(chi2, k)`` spelling raises a TypeError there.
    selector = SelectKBest(score_func=chi2, k=k)

    X_train_x2 = selector.fit_transform(X_train, y_train)
    X_test_x2 = selector.transform(X_test)
    return [X_train_x2, X_test_x2]

In [5]:
# code used in practical 9
# for mutual-information based selection

from sklearn.feature_selection import mutual_info_classif

def mi(X_train, X_test, y_train, k):
    """Keep the ``k`` features with the highest mutual information score.

    The selector is fitted on the training data only and then applied
    unchanged to the test data, so both returned matrices share the same
    selected columns.

    Parameters
    ----------
    X_train : feature matrix used to fit the selector.
    X_test : feature matrix reduced with the already-fitted selector.
    y_train : class labels aligned with the rows of ``X_train``.
    k : number of top-scoring features to keep.

    Returns
    -------
    list
        ``[X_train_mi, X_test_mi]`` - the reduced train and test matrices.
    """
    # Honour the caller's ``k``; it used to be ignored in favour of a
    # hard-coded 10. The local is not called ``mi`` so it no longer shadows
    # this function's own name inside the body.
    selector = SelectKBest(score_func=mutual_info_classif, k=k)
    X_train_mi = selector.fit_transform(X_train, y_train)
    X_test_mi = selector.transform(X_test)
    return [X_train_mi, X_test_mi]

In [6]:
# Read the recipe training and test tables from the project dataset folder
# (paths are relative to the notebook's working directory).
train_df, test_df = (
    pd.read_csv(r"COMP30027_2021_Project2_datasets/recipe_train.csv"),
    pd.read_csv(r"COMP30027_2021_Project2_datasets/recipe_test.csv"),
)

In [7]:
## Creating the train and test data for each feature


def _vectorize_text_column(column):
    """Fit a bag-of-words model on one text column and encode both frames.

    The vocabulary is learned from ``train_df[column]`` only (English stop
    words removed); ``test_df[column]`` is encoded with that same fitted
    vectorizer so train and test share their columns.

    Returns ``(vectorizer, train_counts, test_counts)``.
    """
    vectorizer = CountVectorizer(stop_words='english')
    train_counts = vectorizer.fit_transform(train_df[column])
    test_counts = vectorizer.transform(test_df[column])
    return vectorizer, train_counts, test_counts


# word counts for the recipe name
countvector, X_train_name, X_test_name = _vectorize_text_column('name')

# word counts for the preparation steps
stepcv, X_train_steps, X_test_steps = _vectorize_text_column('steps')

# word counts for the ingredient list
ingcv, X_train_ing, X_test_ing = _vectorize_text_column('ingredients')

# number of ingredients per recipe
X_ning_train = train_df['n_ingredients']
X_ning_test = test_df['n_ingredients']

# number of steps per recipe
X_nsteps_train = train_df['n_steps']
X_nsteps_test = test_df['n_steps']

# Stack every feature group side by side into one sparse matrix.
# ``.values[:, None]`` turns the 1-D numeric column into an (n_rows, 1) column
# so it can be stacked next to the count matrices.
# n_steps is currently NOT part of the stacked matrix; to include it, append
# ``X_nsteps_train.values[:, None]`` / ``X_nsteps_test.values[:, None]`` to the
# respective list.
# CSR is requested explicitly: hstack would otherwise return COO, which cannot
# be indexed and has to be converted again by each estimator that consumes it.
X_train = scipy.sparse.hstack(
    [X_train_name, X_train_steps, X_train_ing, X_ning_train.values[:, None]],
    format='csr',
)
X_test = scipy.sparse.hstack(
    [X_test_name, X_test_steps, X_test_ing, X_ning_test.values[:, None]],
    format='csr',
)

In [8]:
## Training and producing predictions


from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import pandas as pd


def _save_predictions(ids, labels, path):
    """Build the ``id`` / ``duration_label`` table, write it to ``path`` as CSV and return it.

    The frame is written without its index column.
    """
    prediction = pd.DataFrame({'id': ids, 'duration_label': labels})
    prediction.to_csv(path, index=False)
    return prediction


y_train = train_df['duration_label']

# NOTE: despite the ``acc_`` prefix, ``acc_mnb`` and ``acc_dt`` hold the
# predicted labels for X_test, not accuracy scores.

# For Multinomial Naive Bayes Classifier
mnb = MultinomialNB().fit(X_train, y_train)
acc_mnb = mnb.predict(X_test)

# For Decision Tree Classifier
# A fixed random_state makes the fitted tree (and therefore the exported
# predictions) reproducible across kernel restarts.
dt = DecisionTreeClassifier(max_depth=None, random_state=0)
dt.fit(X_train, y_train)
acc_dt = dt.predict(X_test)

# 1-based row ids, one per prediction. The length is derived from the
# predictions instead of a hard-coded 10000 so a test set of any size works.
# NOTE: this name shadows the builtin ``id``; it is kept because other cells
# may read it.
id = list(range(1, len(acc_mnb) + 1))

# NOTE: the output file names carry no ``.csv`` extension; they are left
# unchanged so anything that already reads these paths keeps working.

# make csv file for mnb prediction
mnb_prediction = _save_predictions(
    id, acc_mnb, 'mnb_prediction (name+steps+ingredients+n_ingredients)'
)
print(mnb_prediction)

# make csv file for dt prediction
dt_prediction = _save_predictions(
    id, acc_dt, 'dt_prediction (name+steps+ingredients+n_ingredients)'
)
print(dt_prediction)

         id  duration_label
0         1             2.0
1         2             1.0
2         3             2.0
3         4             1.0
4         5             2.0
...     ...             ...
9995   9996             2.0
9996   9997             1.0
9997   9998             1.0
9998   9999             1.0
9999  10000             2.0

[10000 rows x 2 columns]
         id  duration_label
0         1             2.0
1         2             1.0
2         3             2.0
3         4             1.0
4         5             1.0
...     ...             ...
9995   9996             2.0
9996   9997             1.0
9997   9998             1.0
9998   9999             1.0
9999  10000             2.0

[10000 rows x 2 columns]
