# Compilation of Code Snippets for Data Science

All you need is here (and google everything else)

Good luck for the test!

---

## 1. Basic Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
sb.set() # set the default Seaborn style for graphics (affects every plot drawn afterwards)

## 2. Pandas DataFrame

2.1. Create a `DataFrame` from a `Dictionary`

In [2]:
# Column-wise data: each key becomes a column name, each list holds that column's values
canteens_dict = {
    "Name": [
        "North Spine", "Koufu", "Canteen 9", "North Hill",
        "Canteen 11", "Canteen 13", "Canteen 14", "Canteen 16",
    ],
    "Stalls": [20, 15, 10, 12, 8, 6, 4, 2],
    "Rating": [4.5, 4.2, 4.0, 3.7, 4.2, 2.9, 4.7, 4.1],
}

# Build the DataFrame (rows get the default 0..n-1 integer index) and display it
canteens_df = pd.DataFrame(data = canteens_dict)
canteens_df

Unnamed: 0,Name,Stalls,Rating
0,North Spine,20,4.5
1,Koufu,15,4.2
2,Canteen 9,10,4.0
3,North Hill,12,3.7
4,Canteen 11,8,4.2
5,Canteen 13,6,2.9
6,Canteen 14,4,4.7
7,Canteen 16,2,4.1


2.1.1. Create a copy of the `DataFrame`

In [None]:
# .copy() returns a new DataFrame object rather than a second name for the same one
canteens_df2 = canteens_df.copy()

2.2. Extract rows

2.2.1. Extract a single row from a `DataFrame`

In [None]:
# Only the last expression of a cell is displayed; run the lines separately to see both results
canteens_df.iloc[0]   # by integer position: the first row
canteens_df.loc[0]    # by index label: the row labelled 0

2.2.2. Extract multiple rows as a `DataFrame` by index

In [None]:
# Passing a list of index labels to .loc returns those rows as a DataFrame
canteens_df.loc[[1,3,5]]

2.2.3. Extract rows as a `DataFrame` by matching values

In [None]:
# Keep only the rows whose "Name" is one of the listed values
names_to_keep = ["Charizard", "Snorlax", "Vivillon"]
name_matches = pkmndata["Name"].isin(names_to_keep)   # boolean mask, one entry per row
pkmndata_pred = pkmndata[name_matches]

2.3. Print rows from a `DataFrame`
* head() outputs the first 5 rows
* head(x) outputs the first x rows
* tail() outputs the last 5 rows
* sample(x) outputs x randomly selected rows

In [None]:
# Only the last expression of a cell is displayed; run the lines separately to see each result
canteens_df.head()      # first rows
canteens_df.tail()      # last rows
canteens_df.sample(3)   # 3 randomly selected rows

2.4. Check the _data type_ and _dimensions_ (rows, cols)

In [None]:
# type() shows the object's class; .shape is a (rows, cols) tuple
print("Data type : ", type(canteens_df))
print("Data dimensions : ", canteens_df.shape)

2.5. Check data types of variables (cols) 

In [None]:
# .dtypes lists the data type of every column
print(canteens_df.dtypes)

2.6. Display all columns

In [None]:
# None removes the column limit, so wide DataFrames are displayed without truncated columns
pd.set_option('display.max_columns', None)

In [None]:
# reset settings: restore the default for the option that was changed above
# (the option changed was 'display.max_columns', not 'display.max_rows')
pd.reset_option('display.max_columns')

2.7. Remove rows/columns

2.7.1. Remove columns

In [None]:
# Names of the columns to remove
cols_to_drop = [
    'Id', 'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
    'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'MoSold', 'YrSold', 'GarageYrBlt',
]

# inplace = True modifies df itself instead of returning a new DataFrame
df.drop(columns = cols_to_drop, inplace = True)

2.8. Convert a variable into "category" data type

In [None]:
# Change to category data type: By converting an existing column to a category dtype:
# (the converted column is assigned back, replacing the original column)
houseCatData['MSSubClass'] = houseCatData['MSSubClass'].astype("category")

# Change to category data type: By specifying dtype when constructing a Series
# (builds a separate Series; houseCatData itself is not modified by this line)
msc = pd.Series(houseCatData['MSSubClass'], dtype="category")

## 3. Import file into a DataFrame

3.1. CSV

In [None]:
# header = None: no row of the file is used as column names; columns are numbered instead
csv_data = pd.read_csv('somedata.csv', header = None)

3.2. TXT

In [None]:
# sep is a regular expression: one or more whitespace characters between values.
# The raw string (r"...") stops Python treating "\s" as an invalid string escape.
# header = None: no row of the file is used as column names.
txt_data = pd.read_table('somedata.txt', sep = r"\s+", header = None)

3.3. XLS

In [None]:
# sheet_name selects which worksheet to read; header = None: no row is used as column names
xls_data = pd.read_excel('somedata.xlsx', sheet_name = 'Sheet1', header = None)

3.4. JSON

In [None]:
# Read a JSON file into a pandas object
json_data = pd.read_json('somedata.json')

3.5. HTML

- For use if the dataset is in a table format within an HTML web page.

In this example, we try to get the cast of Kung-Fu Panda from http://www.imdb.com/title/tt0441773/fullcredits/?ref_=tt_ov_st_sm

html_data will be a list of DataFrames, each DataFrame derived from a table within that HTML

In [None]:
# Page to scrape; read_html returns one DataFrame per table found on the page
imdb_url = 'http://www.imdb.com/title/tt0441773/fullcredits/?ref_=tt_ov_st_sm'
html_data = pd.read_html(imdb_url)

# How many tables were found, and a peek at the one at list position 2
print("HTML tables : ", len(html_data))
print(type(html_data[2]))
html_data[2].head()

## 4. Statistics for Variables in a DataFrame

for Numeric Data

### 4.1. Uni-Variate Statistics

4.1.1. Extract a single variable as a `DataFrame`

In [None]:
# Wrap the selected column in pd.DataFrame so the result is a DataFrame
hp = pd.DataFrame(pkmndata['HP'])

4.1.2. Check _data type_ and _size of dataset_ (rows)

In [None]:
print("Data type : ", type(hp))
# .size is the total number of elements (rows x cols); hp holds a single variable, so this is the row count
print("Data dims : ", hp.size)

4.1.3. Check summary statistics

In [None]:
# Summary statistics of the variable
hp.describe()

### 4.2. Multi-Variate Statistics

4.2.1. Extract multiple variables to a `DataFrame`

In [None]:
# Variables to extract; selecting with a list of names returns all of them together
stat_cols = ["HP", "Attack", "Defense", "Sp. Atk", "Sp. Def", "Speed"]
numDF = pd.DataFrame(pkmndata[stat_cols])

4.2.2. Check summary statistics

In [None]:
# Summary statistics, one column of results per variable
numDF.describe()

4.2.3. Check information about the `DataFrame` and its variables

In [None]:
# Overview of the DataFrame: its columns, their dtypes and non-null counts
numDF.info()

4.2.4. Check 'skewness' of each variable's distribution

In [None]:
# Skewness of each variable's distribution, one value per column
numDF.skew()

## 5. Visualisation with Plots

for Numeric data

### 5.1. Uni-Variate Plots

5.1.1. Boxplot

In [None]:
f = plt.figure(figsize=(24, 4))        # figsize is (width, height) in inches
sb.boxplot(data = hp, orient = "h")    # orient = "h" draws the box horizontally

5.1.2. Histogram

In [None]:
f = plt.figure(figsize=(16, 8))   # figsize is (width, height) in inches
sb.histplot(data = hp)            # histogram of the values in hp

5.1.3. Kernel Density Estimate (KDE)

In [None]:
f = plt.figure(figsize=(16, 8))   # figsize is (width, height) in inches
sb.kdeplot(data = hp)             # smooth density curve of the values in hp

5.1.4. Histogram and KDE

In [None]:
f = plt.figure(figsize=(16, 8))       # figsize is (width, height) in inches
sb.histplot(data = hp, kde = True)    # kde = True overlays the density curve on the histogram

5.1.5. Violin Plot - combines boxplot and KDE

In [None]:
f = plt.figure(figsize=(16, 8))           # figsize is (width, height) in inches
sb.violinplot(data = hp, orient = "h")    # orient = "h" draws the violin horizontally

### 5.2. Bi-Variate Plots

5.2.1. Jointplot
* concatenate two variables together first

In [None]:
# axis = 1 places the two inputs side by side as columns;
# reindex(attack.index) conforms the rows of the result to attack's index
jointDF = pd.concat([attack, hp], axis = 1).reindex(attack.index)
# x and y name the columns of jointDF to plot against each other
sb.jointplot(data = jointDF, x = "Attack", y = "HP", height = 12)

5.2.2. Heatmap: shows correlation between two variables

In [None]:
jointDF.corr() # just a normal DataFrame of the correlation (not displayed here: it is not the cell's last expression)

# vmin/vmax fix the colour scale to the full correlation range [-1, 1];
# annot = True writes the value in each cell, formatted to 2 decimal places
sb.heatmap(jointDF.corr(), vmin = -1, vmax = 1, annot = True, fmt=".2f")

5.2.3. **Subplots** for single variables in individual `DataFrames`

5.2.3.1. 1x3

In [None]:
# One row of three panels; give each panel a descriptive name
f, axes = plt.subplots(1, 3, figsize=(24, 6))
ax_box, ax_hist, ax_violin = axes

# Same variable drawn three ways, one plot per panel
sb.boxplot(data = total_train, orient = "h", ax = ax_box)
sb.histplot(data = total_train, ax = ax_hist)
sb.violinplot(data = total_train, orient = "h", ax = ax_violin)

5.2.3.2. 2x3

In [None]:
# Set up a matplotlib figure with a 2 x 3 grid of subplots (2 rows, 3 cols)
f, axes = plt.subplots(2, 3, figsize=(24, 12))

# One row per variable: row 0 for HP, row 1 for Attack.
# Each row shows the basic uni-variate figures: boxplot, histogram, violin plot.
for row, var_df in enumerate([hp, attack]):
    sb.boxplot(data = var_df, orient = "h", ax = axes[row, 0])
    sb.histplot(data = var_df, ax = axes[row, 1])
    sb.violinplot(data = var_df, orient = "h", ax = axes[row, 2])

### 5.3. Multi-Variate Plots

5.3.1. **Subplots** for a multi-variate `DataFrame`

In [None]:
# Draw the distributions of all variables on a 6 x 3 grid:
# one row per variable, columns are boxplot / histogram / violin plot
f, axes = plt.subplots(6, 3, figsize=(18, 24))

# Iterating over a DataFrame yields its column names
for row, var in enumerate(numDF):
    column_values = numDF[var]
    sb.boxplot(data = column_values, orient = "h", ax = axes[row, 0])
    sb.histplot(data = column_values, ax = axes[row, 1])
    sb.violinplot(data = column_values, orient = "h", ax = axes[row, 2])

5.3.2. Correlation Matrix & Heatmap

5.3.2.1. Whole heatmap (includes all variables in dataframe)

In [None]:
numDF.corr() # just a normal DataFrame of the correlation matrix (not displayed here: it is not the cell's last expression)

f = plt.figure(figsize=(12, 12))
# vmin/vmax fix the colour scale to [-1, 1]; annot = True writes each value, formatted to 2 decimal places
sb.heatmap(numDF.corr(), vmin = -1, vmax = 1, annot = True, fmt = ".2f")

5.3.2.2. Partial heatmap (only specified variables in groupby included)

In [None]:
f = plt.figure(figsize=(17, 12))

# Count the rows in every (MSSubClass, OverallQual) pair, then unstack so that
# MSSubClass runs down the rows and OverallQual across the columns
subclass_qual_counts = houseCatData.groupby(['MSSubClass', 'OverallQual']).size().unstack()

sb.heatmap(subclass_qual_counts,
           linewidths=1, annot=True, annot_kws={"size": 18}, cmap="BuGn")

5.3.3. Pairplot: Jointplot but for >2 variables

In [None]:
# Grid of plots covering every pair of variables in numDF
sb.pairplot(data = numDF)

### 5.4. Categorical Plots

5.4.1. Catplot: Number of data points in each category

In [None]:
# kind = "count" draws one bar per category; putting the variable on y makes the bars horizontal
sb.catplot(y = "Type 1", data = pkmndata, kind = "count", height = 8)

5.4.2. Catplot: Multiple plots of count in each category across a 2nd category

In this case:
* 1st category = Type 1
* 2nd category = Generation

In [None]:
# col = 'Generation' draws one count plot per Generation value; col_wrap = 2 starts a new row after every 2 plots
sb.catplot(y = 'Type 1', data = pkmndata, col = 'Generation', kind = 'count', col_wrap = 2, height = 8)

5.4.3. Heatmap of the distribution (count) of 2 categories

In [None]:
f = plt.figure(figsize=(20, 20))

# Count the rows in every (Type 1, Type 2) pair, then unstack so that
# Type 1 runs down the rows and Type 2 across the columns
type_pair_counts = dualtype_data.groupby(['Type 1', 'Type 2']).size().unstack()

sb.heatmap(type_pair_counts,
           linewidths = 1, annot = True, annot_kws = {"size": 18}, cmap = "BuGn")

5.4.4. Boxplot: a boxplot for each category

In [None]:
# x is the numeric data, y is the categorical data
# One horizontal box is drawn per value of the y variable
f = plt.figure(figsize=(18, 6))
sb.boxplot(x = "Total", y = "Legendary", data = trainDF, orient = "h")

5.4.5. Swarmplot: a swarmplot for each category

In [None]:
# x is the numeric data, y is the categorical data
# One horizontal swarm of points is drawn per value of the y variable
f = plt.figure(figsize=(18, 6))
sb.swarmplot(x = "Total", y = "Legendary", data = trainDF, orient = "h")

## 6. Categorical Data

6.1. Number of Categories

In [None]:
# .unique() returns the distinct values of the column; its length is the number of categories
print("Number of Categories:", len(pkmndata["Generation"].unique()))

6.2. Number of data points in each category

In [None]:
# Number of rows in each category
print(pkmndata["Generation"].value_counts())

6.2.1. Ratio of data points of each category

In [None]:
# normalize = True returns each category's share of the rows (fractions that sum to 1)
# instead of raw counts. Applied to the same categorical column as the counts in 6.2.
pkmndata["Generation"].value_counts(normalize = True)

## 7. Data Cleaning

#### 7.1. NULL/NA values

7.1.1. Remove NA values or Fill NA values with default

In [None]:
# dropna() returns a NEW Series without the missing values and leaves the column itself
# untouched, so keep the result in a variable if you need it
type2_without_na = pkmndata["Type 2"].dropna()

# Fill missing values with a default and assign the result back to the column.
# Calling fillna(..., inplace = True) on a selected column is chained assignment:
# it warns in recent pandas and does not update the DataFrame under Copy-on-Write.
pkmndata["Type 2"] = pkmndata["Type 2"].fillna(value = 'hello')

7.1.2. Extract null / non-null values

In [None]:
# Boolean mask: True where "Type 2" is missing
has_no_type2 = pkmndata["Type 2"].isnull()

singletype_data = pkmndata[has_no_type2]    # rows with a null "Type 2"
dualtype_data = pkmndata[~has_no_type2]     # rows with a non-null "Type 2" (~ inverts the mask)

7.1.3. Check number of null values for each variable (col)

In [None]:
# isnull() marks each missing cell as True; sum() then counts the True values per column
pkmndata_clean.isnull().sum()

#### 7.2. Duplicate Data

7.2.1. Find Duplicate Data

In [None]:
# find rows where Variable '#' is duplicated
# keep = False marks every occurrence as a duplicate, including the first one
dupid_data = pkmndata[pkmndata.duplicated("#", keep = False)]

7.2.2. Group Duplicate Data together and print out

In [None]:
# Summary: how many rows share an ID, and how many distinct IDs are involved
dupids = dupid_data["#"].unique()
print("Pokemons with Duplicate IDs :", len(dupid_data))
print("Unique Pokemons with DupIDs :", len(dupids))
print()

# Table header
print("# \t Count \t List of Pokemons with Duplicate IDs")
print()

# One line per duplicated ID: the ID, how many rows carry it, and their names
for dupid in dupids:
    rows_with_id = dupid_data[dupid_data["#"] == dupid]
    dupid_list = list(rows_with_id["Name"])
    print(dupid, "\t", len(dupid_list), "\t", dupid_list)

#### 7.3. Sort

7.3.1. Sort Values

In [None]:
# sort in ascending order by Variable "Name"
# (sort_values returns a new sorted DataFrame; the result is not stored here)
dupid_data.sort_values(by = "Name")

# sort in descending order
dupid_data.sort_values(by = "Name", ascending=False)

#### 7.4. Rename a column

7.4.1. Rename

In [None]:
# The dict maps old column name -> new column name; inplace = True modifies pkmndata_clean itself
pkmndata_clean.rename(columns = {'#': 'ID'}, inplace = True)

7.4.2. Convert Name to Uppercase

In [None]:
# .str.upper() upper-cases every column name; assigning back replaces the column labels
pkmndata_clean.columns = pkmndata_clean.columns.str.upper()

7.4.3. Replace specified characters

In [None]:
# eg. remove all .
# regex = False treats "." as a literal dot. Interpreted as a regular expression,
# "." matches ANY character and would wipe out the whole column name.
pkmndata_clean.columns = pkmndata_clean.columns.str.replace(".", "", regex = False)

# eg. replace all spaces with _
pkmndata_clean.columns = pkmndata_clean.columns.str.replace(" ", "_", regex = False)

7.4.4. Using Regular Expression (RegEx) with the `re` library to search and replace

In [None]:
import re

# (pattern, replacement) pairs, applied to every NAME one after another, in this order
name_fixes = [
    (r'(.+)(Forme)', r'\1'),    # eg. Fix names with extra Extensions: keep what precedes "Forme"
    (r'(Hoopa)(.+)', r'\2'),    # eg. Fix names with extra Extensions: keep what follows "Hoopa"
    (r'(.+Mega)(.+)', r'\1'),   # eg. Fix names with Mega in between: keep everything up to "Mega"
    (r'\s+', ''),               # eg. Remove Blanks from all the Names
]

for pattern, replacement in name_fixes:
    pkmndata_clean["NAME"] = pkmndata_clean["NAME"].apply(lambda x: re.sub(pattern, replacement, x))

#### 7.5. Rename a row

In [None]:
# change the variable "NAME" of the row index == 7
# .loc[row label, column name] addresses a single cell for assignment
pkmndata_clean.loc[7,"NAME"] = "CharizardMegaX"

#### 7.6. Set a new index

In [None]:
# change index to variable "NAME"
# set_index returns a new DataFrame, so the result is assigned back to the same name
pkmndata_clean = pkmndata_clean.set_index('NAME')

## 8. Linear Regression

using Scikit-Learn (`sklearn`)

8.1. Linear Regression model

In [None]:
# Imports
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Extract Response and Predictors
predictors = ["HP", "Attack", "Defense"]
y = pd.DataFrame(pkmndata["Total"])
X = pd.DataFrame(pkmndata[predictors])

# Split the Dataset into Train and Test (test_size = 0.25 -> 75% train / 25% test)
# random_state fixes the shuffle, so the same split (and the same results) come back on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Check the sample sizes
print("Train Set :", y_train.shape, X_train.shape)
print("Test Set  :", y_test.shape, X_test.shape)

# Linear Regression using Train Data
linreg = LinearRegression()         # create the linear regression object
linreg.fit(X_train, y_train)        # train the linear regression model

# Predict y values (response) corresponding to X (predictors)
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

8.2. Coefficients

In [None]:
# Coefficients of the Linear Regression line
print('Intercept \t: b = ', linreg.intercept_)
print('Coefficients \t: a = ', linreg.coef_)

# Print the Coefficients against Predictors (for multi-variate linear regression)
# zip pairs each predictor name with its coefficient; coef_[0] is the first row of coef_
pd.DataFrame(list(zip(X_train.columns, linreg.coef_[0])), columns = ["Predictors", "Coefficients"])

8.3. Linear Regression Line
> Just FYI, not really in use

In [None]:
# NOTE(review): this cell looks written for a model fitted on a SINGLE predictor.
# `predictors` in the 8.1 cell holds three columns, in which case X_train and y_train
# differ in shape and the scatter/line plots below presumably will not work as intended.
# Confirm (or refit on one predictor) before reusing.

# Formula for the Regression line: y = b + a * x
regline_x = X_train
regline_y = linreg.intercept_ + linreg.coef_ * X_train

# Plot the Linear Regression line on Scatterplot
f = plt.figure(figsize=(16, 8))
plt.scatter(X_train, y_train)
plt.plot(regline_x, regline_y, 'r-', linewidth = 2)   # 'r-' = solid red line
plt.xlabel("X")
plt.ylabel("y")
plt.show()

# Plot predictions on Scatterplot
# NOTE(review): the axis labels "GrLivArea"/"SalePrice" look like leftovers from a
# house-price example and do not match the variables used in 8.1 - confirm.
f = plt.figure(figsize=(8, 18))
plt.scatter(X_train, y_train)
plt.scatter(X_train, y_train_pred)
plt.xlabel("GrLivArea")
plt.ylabel("SalePrice")
plt.show()

8.4. Plot Predictions vs. True Values

In [None]:
# Two side-by-side panels: Train on the left, Test on the right
f, axes = plt.subplots(1, 2, figsize=(24, 12))

# (axis, true values, predicted values, point colour, x label, y label) for each panel
panels = [
    (axes[0], y_train, y_train_pred, "blue",
     "True values of the Response Variable (Train)",
     "Predicted values of the Response Variable (Train)"),
    (axes[1], y_test, y_test_pred, "green",
     "True values of the Response Variable (Test)",
     "Predicted values of the Response Variable (Test)"),
]

for ax, y_true, y_pred, colour, x_label, y_label in panels:
    ax.scatter(y_true, y_pred, color = colour)
    ax.plot(y_true, y_true, 'w-', linewidth = 1)   # true-vs-true reference line (perfect prediction)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)

plt.show()

8.5. Goodness of Fit

In [None]:
# Goodness of Fit, reported first on the Train data and then on the Test data.
# linreg.score gives R^2; mean_squared_error compares true and predicted responses.
evaluation_sets = [
    ("Train", X_train, y_train, y_train_pred),
    ("Test", X_test, y_test, y_test_pred),
]

for set_name, X_part, y_part, y_part_pred in evaluation_sets:
    print("Goodness of Fit of Model \t" + set_name + " Dataset")
    print("Explained Variance (R^2) \t:", linreg.score(X_part, y_part))
    print("Mean Squared Error (MSE) \t:", mean_squared_error(y_part, y_part_pred))
    print()

### Just copy paste: Linear Regression

In [None]:
# Self-contained Linear Regression workflow: split, fit, inspect coefficients,
# predict, plot predictions against true values, and report goodness of fit.

# Import essential models and functions from sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Extract Response and Predictors
predictors = ["HP", "Attack", "Defense"]

y = pd.DataFrame(pkmndata["Total"])
X = pd.DataFrame(pkmndata[predictors])

# Split the Dataset into Train and Test (test_size = 0.25 -> 75% train / 25% test)
# random_state fixes the shuffle, so the same split (and the same results) come back on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Linear Regression using Train Data
linreg = LinearRegression()         # create the linear regression object
linreg.fit(X_train, y_train)        # train the linear regression model

# Coefficients of the Linear Regression line
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()

# Print the Coefficients against Predictors
# zip pairs each predictor name with its coefficient; coef_[0] is the first row of coef_
print(pd.DataFrame(list(zip(X_train.columns, linreg.coef_[0])), columns = ["Predictors", "Coefficients"]))
print()

# Predict Response corresponding to Predictors
y_train_pred = linreg.predict(X_train)
y_test_pred = linreg.predict(X_test)

# Plot the Predictions vs the True values (Train on the left, Test on the right)
# The line plotted as true-vs-true marks where a perfect prediction would fall
f, axes = plt.subplots(1, 2, figsize=(24, 12))
axes[0].scatter(y_train, y_train_pred, color = "blue")
axes[0].plot(y_train, y_train, 'w-', linewidth = 1)
axes[0].set_xlabel("True values of the Response Variable (Train)")
axes[0].set_ylabel("Predicted values of the Response Variable (Train)")
axes[1].scatter(y_test, y_test_pred, color = "green")
axes[1].plot(y_test, y_test, 'w-', linewidth = 1)
axes[1].set_xlabel("True values of the Response Variable (Test)")
axes[1].set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_train, y_train))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_train, y_train_pred))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Explained Variance (R^2) \t:", linreg.score(X_test, y_test))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, y_test_pred))
print()

## 9. Classification Tree

using Scikit-Learn (`sklearn`)

9.1. Decision Tree Model

In [None]:
# Imports
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split   # needed for the Train/Test split in the next cell
from sklearn.metrics import confusion_matrix

In [None]:
# Extract Response and Predictors
predictors = ["Total", "HP", "Attack", "Defense"]
y = pd.DataFrame(pkmndata["Legendary"])
X = pd.DataFrame(pkmndata[predictors])

# Split the Dataset into Train and Test (test_size = 0.25 -> 75% train / 25% test)
# random_state fixes the shuffle, so the same split (and the same results) come back on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Check the sample sizes
print("Train Set :", y_train.shape, X_train.shape)
print("Test Set  :", y_test.shape, X_test.shape)

# Decision Tree using Train Data
dectree = DecisionTreeClassifier(max_depth = 2)  # create the decision tree object (max_depth limits the tree's depth)
dectree.fit(X_train, y_train)                    # train the decision tree model

9.2. Plot Decision Tree

In [None]:
f = plt.figure(figsize=(12,12))
# feature_names is passed as a plain list: some scikit-learn releases reject a pandas Index here
plot_tree(dectree, filled=True, rounded=True,
          feature_names=list(X_train.columns),   # the variables involved in splitting
          class_names=["Ordinary","Legendary"])  # the classes to split into

9.3. Predict

In [None]:
# Predict y values corresponding to X
# (one predicted class per row, for the Train and the Test predictors separately)
y_train_pred = dectree.predict(X_train)
y_test_pred = dectree.predict(X_test)

9.3. Goodness of Fit

In [None]:
# Goodness of Fit and confusion-matrix rates, reported for Train first and then Test
evaluation_sets = [
    ("Train", X_train, y_train, y_train_pred),
    ("Test", X_test, y_test, y_test_pred),
]

for set_name, X_part, y_part, y_part_pred in evaluation_sets:
    # Overall accuracy on this dataset
    print("Goodness of Fit of Model \t" + set_name + " Dataset")
    print("Classification Accuracy \t:", dectree.score(X_part, y_part))
    print()

    # Flatten the 2x2 confusion matrix into its four counts, then turn them into rates
    tn, fp, fn, tp = confusion_matrix(y_part, y_part_pred).ravel()
    print("True Positive Rate:\t", tp/(tp+fn))
    print("True Negative Rate:\t", tn/(tn+fp))
    print("False Positive Rate:\t", fp/(tn+fp))
    print("False Negative Rate:\t", fn/(tp+fn))

9.4. Plot Confusion Matrix

In [None]:
# for individual datasets in different plots
# Each heatmap gets its own figure; without a new figure the second heatmap
# would be drawn onto the same axes as the first.

# Train Data
f = plt.figure(figsize=(6, 4))
sb.heatmap(confusion_matrix(y_train, y_train_pred), 
           annot = True, fmt=".0f", annot_kws={"size": 18})

# Test Data
f = plt.figure(figsize=(6, 4))
sb.heatmap(confusion_matrix(y_test, y_test_pred), 
           annot = True, fmt=".0f", annot_kws={"size": 18})

In [None]:
# for both Train and Test in a single plot: Train on the left panel, Test on the right
f, axes = plt.subplots(1, 2, figsize=(12, 4))

true_and_predicted = [(y_train, y_train_pred), (y_test, y_test_pred)]
for ax, (y_true, y_pred) in zip(axes, true_and_predicted):
    # annot = True writes the count in each cell; fmt=".0f" shows it without decimals
    sb.heatmap(confusion_matrix(y_true, y_pred),
               annot = True, fmt=".0f", annot_kws={"size": 18}, ax = ax)

9.5. Prediction of Class Probabilities

In [None]:
# Extract Predictors for Prediction
# (same predictor columns that the tree was trained on)
X_pred = pd.DataFrame(pkmndata_pred[predictors])

# Predict Probabilities corresponding to Predictors
# (one row per sample, one column per class)
y_prob = dectree.predict_proba(X_pred)

### Just copy paste: Decision Tree

In [None]:
# Self-contained Decision Tree workflow: split, fit, predict,
# report accuracy, and plot the confusion matrices.

# Import essential models and functions from sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Extract Response and Predictors
predictors = ["Total", "HP", "Attack", "Defense", "Sp. Atk", "Sp. Def", "Speed"]

y = pd.DataFrame(pkmndata['Type 1'].astype('category'))
X = pd.DataFrame(pkmndata[predictors]) 

# Split the Dataset into Train and Test (test_size = 0.25 -> 75% train / 25% test)
# random_state fixes the shuffle, so the same split (and the same results) come back on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

# Decision Tree using Train Data
dectree = DecisionTreeClassifier(max_depth = 3)  # create the decision tree object (max_depth limits the tree's depth)
dectree.fit(X_train, y_train)                    # train the decision tree model

# Predict Response corresponding to Predictors
y_train_pred = dectree.predict(X_train)
y_test_pred = dectree.predict(X_test)

# Check the Goodness of Fit (on Train Data)
print("Goodness of Fit of Model \tTrain Dataset")
print("Classification Accuracy \t:", dectree.score(X_train, y_train))
print()

# Check the Goodness of Fit (on Test Data)
print("Goodness of Fit of Model \tTest Dataset")
print("Classification Accuracy \t:", dectree.score(X_test, y_test))
print()

# Plot the Confusion Matrix for Train (left) and Test (right)
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sb.heatmap(confusion_matrix(y_train, y_train_pred),
           annot = True, fmt=".0f", annot_kws={"size": 18}, ax = axes[0])
sb.heatmap(confusion_matrix(y_test, y_test_pred), 
           annot = True, fmt=".0f", annot_kws={"size": 18}, ax = axes[1])

In [None]:
# Plot the trained Decision Tree
from sklearn.tree import plot_tree

f = plt.figure(figsize=(12,12))
# class_names is taken from the fitted tree (dectree.classes_) so there is exactly one label per
# class the model was trained on; a hard-coded two-name list only fits a two-class response.
# feature_names is passed as a plain list: some scikit-learn releases reject a pandas Index here.
plot_tree(dectree, filled=True, rounded=True, 
          feature_names=list(X_train.columns), 
          class_names=[str(label) for label in dectree.classes_])