# Proyecto Final - Deep Learning y Redes Neuronales

MIIA4406 - 201819 - Universidad de los Andes
### Integrantes:
- Jorge Eduardo Rodriguez Cardozo - 200711501
- German Augusto Carvajal Murcia - 201313516

In [None]:
import os
import pandas as pd
import numpy as np
from time import time
inicio=time()
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_auc_score,make_scorer,accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV,RandomizedSearchCV
from keras.preprocessing.image import img_to_array, load_img
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import string
import unidecode
# Fetch the NLTK resources used later by the lemmatizer and the stop-word filter.
nltk.download('wordnet')
nltk.download('stopwords')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# All data and image folders are resolved relative to the current working directory.
path = os.getcwd()
# Training set; the first CSV column becomes the DataFrame index.
dataTraining = pd.read_csv(os.path.join(path, 'data', 'dataTraining.csv'), encoding='UTF-8', index_col=0)

In [None]:
# Print iterations progress
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█'):
    """
    Call in a loop to draw a single-line progress bar on standard output.

    Each call rewrites the current line (the output starts with a carriage
    return), and a newline is printed once ``iteration == total``.

    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    # Local import: `sys` is not imported at the top of the notebook.
    import sys

    if total:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)
    else:
        # Nothing to iterate over: show a full bar instead of dividing by zero.
        fraction = 1.0
        filledLength = length
    percent = ("{0:." + str(decimals) + "f}").format(100 * fraction)
    bar = fill * filledLength + '-' * (length - filledLength)
    sys.stdout.write('\r %s |%s| %s%% %s' % (prefix, bar, percent, suffix))
    sys.stdout.flush()
    # Print New Line on Complete
    if iteration == total:
        print()

In [None]:
# --- Experiment configuration ---
# Size passed as `target_size` to load_img when importing each image.
Img_size=(256,160)
# True loads the images in grayscale (1 channel); False loads them as RGB (3 channels).
Gray=True
# Maximum number of text features kept by the CountVectorizer.
words= 2000
# (enabled, n_components): whether to reduce the image features with PCA and,
# when enabled, how many principal components to keep.
usePCA = (False, 1000)

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()
# Filled on first use so the stop-word corpus is read once, not once per token.
_stopword_cache = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if 'english' not in _stopword_cache:
        _stopword_cache['english'] = frozenset(stopwords.words('english'))
    return _stopword_cache['english']


def split_into_lemmas(text):
    """Tokenize `text` into lemmas for use as a CountVectorizer analyzer.

    Steps: lowercase, transliterate with unidecode, split on whitespace, drop
    punctuation-only tokens and English stop words, then lemmatize each
    remaining token with WordNet and ASCII-encode it.

    Args:
        text: the raw document string.

    Returns:
        A list with one ASCII-encoded lemma per kept token, in document order.
    """
    stop_words = _english_stopwords()
    tokens = unidecode.unidecode(text.lower()).split()
    # NOTE(review): `tok not in string.punctuation` is a substring test on the
    # whole token, so it only removes tokens that are themselves a run of
    # punctuation characters; punctuation attached to a word (e.g. "end.") is
    # kept. Left unchanged so the extracted features stay the same.
    kept = [
        tok for tok in tokens
        if tok not in string.punctuation and tok not in stop_words
    ]
    return [wordnet_lemmatizer.lemmatize(tok).encode('ascii') for tok in kept]

In [None]:
# Bag-of-words counts over the plot text, tokenized by split_into_lemmas and
# capped at `words` features.
# NOTE(review): scikit-learn documents ngram_range as applying only when the
# analyzer is not a callable, so (1, 2) presumably has no effect here — confirm.
vect = CountVectorizer(ngram_range=(1, 2), max_features=words, analyzer=split_into_lemmas,binary=False)
# Learn the vocabulary on the training plots and build the sparse count matrix.
X_txt = vect.fit_transform(dataTraining['plot'])
print('Text input size: ',X_txt.shape)

In [None]:
# Load every training image into one row of a dense matrix.
# Row width is height * width * channels (1 channel when Gray, otherwise 3).
N = dataTraining.shape[0]
X_image = np.empty((N, Img_size[0] * Img_size[1] * (3 - 2 * Gray)))
n = 0  # keeps `n` defined even if the index is empty
for n, i in enumerate(dataTraining.index, start=1):
    # Image files are named after the sample's index value.
    img = load_img(
        os.path.join(path, 'images_resize_gray', str(i) + '_resize_gray.jpeg'),
        target_size=Img_size,
        grayscale=Gray,
    )  # PIL image
    # Flatten the pixel array into a single (1, n_pixels) row.
    x = img_to_array(img).reshape(1, -1)
    X_image[n - 1] = x
    printProgressBar(n, N, prefix='Progress:', suffix='Complete', length=50)

In [None]:
# Optional dimensionality reduction of the image features, controlled by usePCA.
if not usePCA[0]:
    print('No PCA')
else:
    # Diagnostic: fit an unrestricted PCA and plot the cumulative explained
    # variance, with a red line at the number of components that will be kept.
    pca = PCA()
    pca.fit(X_image)
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    plt.figure(1, figsize=(10, 5))
    plt.xlabel('Num factors')
    plt.ylabel('Explained variance')
    plt.axvline(x=usePCA[1], color='red')
    plt.plot(cumulative_variance)
    # Actual reduction: keep usePCA[1] components and replace the image matrix.
    pcaT = PCA(usePCA[1])
    X_image = pcaT.fit_transform(X_image)

In [None]:
import ast

# Final design matrix: dense text counts followed by the image features.
X = np.concatenate((X_txt.toarray(), X_image), axis=1)
Y_lab = dataTraining['genres']
mlb = MultiLabelBinarizer()
# SECURITY: this previously ran eval() on every 'genres' cell read from the
# CSV, which executes arbitrary code found in the file. ast.literal_eval only
# accepts Python literals.
# NOTE(review): assumes each cell is a list literal such as "['Action', 'Drama']"
# — confirm against the data; a non-literal cell now raises instead of running.
Y = mlb.fit_transform(Y_lab.map(ast.literal_eval))
print('Features: ',X.shape)
print('Labels: ',Y.shape)

In [None]:
# Hold out 10% of the samples as a test set; the seed is fixed so the split is repeatable.
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.1,random_state=123)

In [None]:
# Table with one row per genre: number of positive labels in the train and test splits.
pd.DataFrame(np.transpose([mlb.classes_,Y_train.sum(axis=0),Y_test.sum(axis=0)]),columns=['Class','Train','Test'])

In [None]:
# Positive-label count per genre in the training split.
h = pd.DataFrame(np.transpose([mlb.classes_, Y_train.sum(axis=0)]), columns=['Class', 'Count'])
# Genres present in fewer than 5% of the training samples, rarest first.
rare_classes = h[h['Count'] < 0.05 * Y_train.shape[0]].sort_values('Count')
# Row labels of `h` are the column positions of those genres in the label matrix.
classes_for_rebalance = rare_classes.index.tolist()
rare_classes

In [None]:
def OverSampling(X, y, target_percentage=0.5, seed=None):
    """Resample the positive rows so they make up `target_percentage` of the output.

    The positive class (y == 1) is assumed to be the minority. Positives are
    drawn with replacement; every negative row (y == 0) is kept once. Note
    that exactly the computed number of positives is drawn, so a class that
    already exceeds the target is reduced rather than increased. Rows whose
    label is neither 0 nor 1 are not returned.

    Args:
        X: array of samples, indexable by an integer index array along axis 0.
        y: 1-D array of 0/1 labels aligned with the rows of X.
        target_percentage: desired fraction of positives in the result,
            with 0 <= target_percentage < 1.
        seed: seed for the private random generator; None draws fresh entropy.

    Returns:
        (X_resampled, y_resampled): the drawn positives first, followed by all
        negatives in their original order.

    Raises:
        ValueError: if target_percentage is outside [0, 1), or if positives
            must be drawn but y contains none.
    """
    if not 0 <= target_percentage < 1:
        raise ValueError(
            'target_percentage must satisfy 0 <= target_percentage < 1, got %r'
            % (target_percentage,))

    positive_idx = np.nonzero(y == 1)[0]
    negative_idx = np.nonzero(y == 0)[0]

    # Solve n_pos / (n_pos + n_neg) = target_percentage for n_pos.
    n_positive_new = int(
        target_percentage * negative_idx.shape[0] / (1.0 - target_percentage))
    if n_positive_new > 0 and positive_idx.shape[0] == 0:
        raise ValueError('cannot oversample: y contains no positive (== 1) samples')

    # A private RandomState yields the same draws as seeding the global
    # generator with `seed`, without clobbering the caller's global RNG state.
    rng = np.random.RandomState(seed)
    drawn = rng.choice(positive_idx.shape[0], n_positive_new)

    # `drawn` indexes within the positives; map it back to row numbers of X.
    filter_ = np.concatenate((positive_idx[drawn], negative_idx), axis=0)
    return X[filter_], y[filter_]

In [None]:
# Stack features and labels side by side ([X_train | Y_train]) so that selecting
# rows of DATA keeps each sample's features and labels together.
DATA=np.concatenate((X_train,Y_train),axis=1)

In [None]:
# X_res starts as the stacked [features | labels] matrix, so every resampling
# step below moves a sample's features and labels together.
X_res=DATA
Y_res=Y_train
for i in classes_for_rebalance:
    # Resample rows using class i as the binary target (5% positives requested).
    # The label vector returned here is discarded by the next assignment.
    X_res,Y_res=OverSampling(X_res,Y_res[:,i],target_percentage=0.05,seed=123)
    # Rebuild the full label matrix from the trailing columns of the resampled rows.
    Y_res=X_res[:,(X_train.shape[1]):]
    # NOTE(review): the 'Correct ouput' check compares Y_res column i with the
    # same column of X_res it was just sliced from, so it cannot fail.
    print('Balanced class: ',mlb.classes_[i],' - Correct ouput: ',np.array_equal(Y_res[:,i],X_res[:,(X_train.shape[1]+i)]),' - New (X)(Y): ',X_res.shape,Y_res.shape)
# Drop the label columns, leaving only the features.
X_res=X_res[:,:(X_train.shape[1])]
# Shuffle features and labels with the same seeded permutation; the resampled
# rows are otherwise grouped (drawn positives first, then negatives).
np.random.seed(123)
idx=np.arange(X_res.shape[0])
np.random.shuffle(idx)
Y_res=Y_res[idx]
X_res=X_res[idx]

In [None]:
# Positive-label count per genre after rebalancing.
pd.DataFrame(np.transpose([mlb.classes_,Y_res.sum(axis=0)]),columns=['Class','Count'])

In [None]:
# Shapes of the rebalanced training features and labels.
print('Features: ',X_res.shape)
print('Labels: ',Y_res.shape)

In [None]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a hyperparameter search.

    Args:
        results: mapping with the keys 'rank_test_score', 'mean_test_score',
            'std_test_score' (arrays) and 'params' (sequence), e.g. the
            ``cv_results_`` attribute of a fitted search object.
        n_top: number of ranks to report, starting at rank 1. Candidates tied
            on a rank are all printed.
    """
    for rank in range(1, n_top + 1):
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            # The trailing empty item produces the blank line between candidates.
            summary = "\n".join([
                "Model with rank: {0}".format(rank),
                "Mean validation AUC: {0:.3f} (std: {1:.3f})".format(mean_score, std_score),
                "Parameters: {0}".format(results['params'][idx]),
                "",
            ])
            print(summary)

# Scorer for model selection: macro-averaged ROC AUC computed from predicted
# probabilities (needs_proba=True); higher is better.
my_roc=make_scorer(roc_auc_score,greater_is_better=True,needs_proba=True,average='macro')

In [None]:
# build a classifier: one random forest per genre (one-vs-rest)
clf1 = OneVsRestClassifier(RandomForestClassifier())

# specify parameters and distributions to sample from
# The "estimator__" prefix routes each parameter to the wrapped RandomForestClassifier.
param_dist = {"estimator__n_estimators": sp_randint(10, 200),
              "estimator__max_depth": sp_randint(3, 10),
              "estimator__max_features": sp_randint(1, 11),
              "estimator__bootstrap": [True, False],
              "estimator__criterion": ["gini", "entropy"]}

# run randomized search: n_iter_search sampled candidates, each scored with
# `folds`-fold cross-validation using the macro ROC AUC scorer
n_iter_search=10
folds=5
random_search1 = RandomizedSearchCV(clf1, param_distributions=param_dist,cv=folds,n_iter=n_iter_search,return_train_score=True,scoring=my_roc,random_state=123)

#Report results
# The search is fitted on the rebalanced training set; `start` times the fit only.
start = time()
random_search1.fit(X_res, Y_res)
print("RandomizedSearchCV for RandomForest took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search1.cv_results_)

In [None]:
# Rebuild the one-vs-rest random forest with the best hyperparameters found by
# the randomized search, train it on the rebalanced data and score the held-out split.
best_params = random_search1.best_params_
rf_settings = {
    name: best_params['estimator__' + name]
    for name in ('bootstrap', 'criterion', 'max_depth', 'max_features', 'n_estimators')
}
clf1 = OneVsRestClassifier(RandomForestClassifier(**rf_settings))
clf1.fit(X_res, Y_res)
# Per-genre probabilities on the untouched test split, then their ROC AUC.
Y_pred = clf1.predict_proba(X_test)
roc_auc_score(Y_test, Y_pred)

In [None]:
# Build the feature matrix for the unlabeled testing set with the same steps
# used for training: image pixels, text counts and (optionally) the fitted PCA.
dataTesting = pd.read_csv(os.path.join(path, 'data', 'dataTesting.csv'), encoding='UTF-8', index_col=0)
N = dataTesting.shape[0]
X_image_pred = np.empty((N, Img_size[0] * Img_size[1] * (3 - 2 * Gray)))
n = 0  # keeps `n` defined even if the index is empty
for n, i in enumerate(dataTesting.index, start=1):
    # Image files are named after the sample's index value.
    img = load_img(
        os.path.join(path, 'images_resize_gray', str(i) + '_resize_gray.jpeg'),
        target_size=Img_size,
        grayscale=Gray,
    )  # PIL image
    # Flatten the pixel array into a single (1, n_pixels) row.
    x = img_to_array(img).reshape(1, -1)
    X_image_pred[n - 1] = x
    printProgressBar(n, N, prefix='Progress:', suffix='Complete', length=50)

# Reuse the vocabulary learned on the training plots (transform only, no refit).
X_txt_pred = vect.transform(dataTesting['plot'])
if usePCA[0]:
    # Project with the PCA fitted on the training images.
    X_image_pred = pcaT.transform(X_image_pred)
X_pred = np.concatenate((X_txt_pred.toarray(), X_image_pred), axis=1)

In [None]:
# One probability column per genre (named 'p_<genre>'), indexed by the testing-set IDs.
pred1=pd.DataFrame(clf1.predict_proba(X_pred),columns=['p_'+s for s in mlb.classes_],index=dataTesting.index)
# Write the predictions to CSV with the index written under the 'ID' column.
# NOTE(review): the file name mentions "Calib", but no probability calibration
# step is visible in this notebook — confirm the name is still accurate.
pred1.to_csv(path_or_buf ='OvsR_RF_CV5_Calib.csv',index_label='ID',encoding='UTF-8')

In [None]:
# Wall-clock time elapsed since `inicio` was recorded in the first cell.
print("Total execution time took %.2f seconds" % ((time() - inicio)))