# Final Project Check-in 2018-11-16

## Group Name: The Placeholders

### Student Names


1. Aditi Sharma
2. JyotiPrakhash Maheswari
3. Joy Wenjing Qi
4. Xinke Sun
5. Zhe Yuan

In [1]:
# Import all import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from summary_fn import *

# Load Data

In [2]:
# Reading all the data files
train = pd.read_csv('train.csv')
member = pd.read_csv('members.csv',parse_dates=['registration_init_time','expiration_date'])
songs = pd.read_csv('songs.csv')
extra_song = pd.read_csv('song_extra_info.csv')
test = pd.read_csv('test.csv')

In [3]:
#train.dtypes

In [4]:
#member.dtypes

In [5]:
# Converting city and registered_via into category
member.city = member.city.astype('category')
member.registered_via = member.registered_via.astype('category')

In [6]:
#member.dtypes

In [7]:
#songs.dtypes

In [8]:
# Converting language to category
songs.language = songs.language.astype('category')

In [9]:
# Function to convert all object columns to category type
def convert_obj_cat(df):
    obj_cols = df.select_dtypes(include='object').columns
    for col in obj_cols:
        df[col] = df[col].astype('category')

In [10]:
convert_obj_cat(train)
convert_obj_cat(test)
convert_obj_cat(songs)
convert_obj_cat(member)
convert_obj_cat(extra_song)

In [11]:
# Merging all required files into a single file
train = train.merge(songs, how='left', on='song_id')
test = test.merge(songs, how='left', on='song_id')


In [12]:
train = train.merge(member, how='left', on='msno')
test = test.merge(member, how='left', on='msno')

In [13]:
train = train.merge(extra_song, how='left', on='song_id')
test = test.merge(extra_song, how='left', on='song_id')

In [14]:
#train.head()

In [15]:
#test.reset_index()
#test.head()

In [16]:
#count_na(train)

In [17]:
#count_na(test)

In [18]:
# Feature extractions
train['days_left'] = (train.expiration_date - train.registration_init_time).dt.days.astype('int')

In [19]:
train['reg_year'] = train['registration_init_time'].dt.year
train['reg_month'] = train['registration_init_time'].dt.month
train['reg_date'] = train['registration_init_time'].dt.day

train['exp_year'] = train['expiration_date'].dt.year
train['exp_month'] = train['expiration_date'].dt.month
train['exp_date'] = train['expiration_date'].dt.day

In [20]:
# Dropping expiration date and registration date columns
train.drop(columns=['registration_init_time','expiration_date'],index=1,inplace=True)

In [21]:
# Using isrc to extract years
def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan
        
train['song_year'] = train['isrc'].apply(isrc_to_year)

# Dropping isrc and name
train.drop(['isrc', 'name'], axis = 1, inplace = True)


In [22]:
# Counted features

def gener_id_count(x):
    if pd.isnull(x):
        return x
    else:
        return x.count('|')+1

def lyricist_count(x):
    if pd.isnull(x):
        return x
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

def composer_count(x):
    if pd.isnull(x):
        return x
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

def artist_count(x):
    if pd.isnull(x):
        return x
    else:
        return sum(map(x.count, ['|', '/', '\\', ';'])) + 1

def is_featured(x):
    if 'feat' in str(x) :
        return 1
    return 0
    

In [23]:
train['gener_count'] = train['genre_ids'].apply(gener_id_count).astype(np.int8)
train['lyricist_count'] = train['lyricist'].apply(lyricist_count).astype(np.int8)
train['composer_count'] = train['composer'].apply(composer_count).astype(np.int8)
train['artist_count'] = train['artist_name'].apply(artist_count).astype(np.int8)
train['features'] = train['artist_name'].apply(is_featured).astype(np.int8)


In [24]:
# Number of times a song is played
song_count = song_play_times(train['song_id'])

In [25]:
song_count_df = pd.DataFrame.from_dict(song_count,orient='index',columns=['counts'])
#song_count_df.head()

In [26]:
song_count_df.reset_index(level=0, inplace=True)

In [27]:
song_count_df = song_count_df.rename(columns={'index':'song_id'})
#song_count_df.head()

In [28]:
train = train.merge(song_count_df,how='left',on='song_id')

In [29]:
#check_point = train

In [30]:
#Counting artist counts similarly
artist_count = pd.DataFrame.from_dict(song_play_times(train['artist_name']),
                                      orient='index',columns=['artist_song_count']).reset_index()
artist_count = artist_count.rename(columns={'index':'artist_name'})

In [31]:
train = train.merge(artist_count,how='left',on='artist_name')

In [32]:
#train.head()

In [33]:
#count_na(train)

# Fit scikit-learn model

In [34]:
# Building models
y_train = train['target']
x_train = train.drop(['target'], axis=1)
convert_obj_cat(x_train)

In [35]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for column_name in x_train.columns:
        if x_train[column_name].dtype.name == 'category':
            x_train[column_name] = le.fit_transform(x_train[column_name].astype(str))
        else:
            pass

In [36]:
x_train = x_train.fillna(-1)

In [37]:
#x_train.dropna(inplace=True)
x_train.shape

(7377417, 30)

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size=0.8, random_state=1)

In [39]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, oob_score=False, random_state=1,n_jobs=-1)
rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

# Evaluation Metrics

In [40]:
from sklearn.metrics import accuracy_score
predicted = rf.predict(X_test)
accuracy = accuracy_score(Y_test, predicted)
print(f'Mean accuracy score: {accuracy:.3}')

Mean accuracy score: 0.693
