In [53]:
# Pandas is used for data manipulation
import pandas as pd
# Use numpy to convert to arrays
import numpy as np
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

In [60]:
features = pd.read_csv('../Final Project/Data/model_input.csv')
df = features.copy()

In [61]:
features.head(5)


Unnamed: 0,Good Cost,Bad Cost,Good Recommendation,Bad Recommendation,Good Stability,Bad Stability,Ads,No Ads,App
0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1
2,0,0,1,1,0,0,0,0,1
3,0,0,2,0,1,0,0,0,1
4,0,0,1,1,0,0,0,0,1


In [64]:
df.loc[(df['Good Cost']>0),'Good Cost']=1
df.loc[(df['Good Cost'].isnull()),'Good Cost']=0
df.loc[(df['Bad Cost']>0),'Bad Cost']=1
df.loc[(df['Bad Cost'].isnull()),'Bad Cost']=0
df.loc[(df['Good Recommendation']>0),'Good Recommendation']=1
df.loc[(df['Good Recommendation'].isnull()),'Good Recommendation']=0
df.loc[(df['Bad Recommendation']>0),'Bad Recommendation']=1
df.loc[(df['Bad Recommendation'].isnull()),'Bad Recommendation']=0
df.loc[(df['Good Stability']>0),'Good Stability']=1
df.loc[(df['Good Stability'].isnull()),'Good Stability']=0
df.loc[(df['Bad Stability']>0),'Bad Stability']=1
df.loc[(df['Bad Stability'].isnull()),'Bad Stability']=0
df.loc[(df['Ads']>0),'Ads']=1
df.loc[(df['Ads'].isnull()),'Ads']=0
df.loc[(df['No Ads']>0),'No Ads']=1
df.loc[(df['No Ads'].isnull()),'No Ads']=0

In [65]:
df.head()

Unnamed: 0,Good Cost,Bad Cost,Good Recommendation,Bad Recommendation,Good Stability,Bad Stability,Ads,No Ads,App
0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1
2,0,0,1,1,0,0,0,0,1
3,0,0,1,0,1,0,0,0,1
4,0,0,1,1,0,0,0,0,1


In [66]:
# Labels are the values we want to predict
labels = np.array(df['App'])

In [67]:
# Remove the labels from the features
# axis 1 refers to the columns
features= df.drop('App', axis = 1)

In [68]:
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)


In [69]:
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [70]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (12511, 8)
Training Labels Shape: (12511,)
Testing Features Shape: (4171, 8)
Testing Labels Shape: (4171,)


# Random Forest

In [83]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 500, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels);

In [84]:
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.8 degrees.


In [85]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
print('mape:', mape)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

mape: [162.38844965  35.96616441  92.10150676 ...  92.10150676  41.74658494
  27.79384818]
Accuracy: 52.02 %.


# Naive Bayes

In [86]:
from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB().fit(train_features, train_labels)

In [87]:
predict_nb = model_nb.predict(test_features)

# Calculate the absolute errors
errors = abs(predict_nb - test_labels)
# Print out the mean absolute error (mae)
#print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.75 degrees.


In [160]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
#print('mape:', mape)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 77.952 %.


# SVM

In [89]:
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(train_features, train_labels)

#Predict the response for test dataset
predict_svm = clf.predict(test_features)

In [90]:
# Calculate the absolute errors
errors = abs(predict_svm - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.75 degrees.


In [91]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
print('mape:', mape)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

mape: [200.          33.33333333 100.         ... 100.          50.
  33.33333333]
Accuracy: 55.95 %.


# Logistic Regression

In [92]:
from sklearn.linear_model import LogisticRegression
LogReg = LogisticRegression()
LogReg.fit(train_features, train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [93]:
predict_lr = LogReg.predict(test_features)

# Calculate the absolute errors
errors = abs(predict_lr - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.75 degrees.


In [51]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
print('mape:', mape)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

mape: [200.          33.33333333 100.         ... 100.          50.
  33.33333333]
Accuracy: 52.28 %.


In [94]:
test_features

array([[0, 0, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=int64)

In [157]:
test_test = [[0,0,0,0,0,1,0,0]]

In [158]:
predict_nb_test = model_nb.predict(test_test)

In [159]:
if predict_nb_test == 1:
    print('Spotify')
elif predict_nb_test == 2:
    print('Pandora')
elif predict_nb_test == 3:
    print('AppleMusic')
elif predict_nb_test == 4:
    print('AmazonMusic')
    

AppleMusic
