In [None]:
import pandas as pd
from path import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
#from sklearn.datasets import load_Group6_Project4_data

### Data Preparation

In [2]:
file = Path('../data/merged_data_file.csv')
# set first line as index-- index_col=0
df = pd.read_csv(file, index_col=0)
df.info

<bound method DataFrame.info of        Unnamed: 0.1          player_name   position                team  conf  \
0                 0        Isaiah Felder     Wing G  South Carolina St.  MEAC   
1                 1  Jalen Coleman-Lands     Wing G              Kansas   B12   
2                 2          K.J. Walton     Wing G               Akron   MAC   
3                 3         Jeriah Horne  Stretch 4               Tulsa  Amer   
4                 4           Eric Curry     Wing F           Minnesota   B10   
...             ...                  ...        ...                 ...   ...   
51199         61056       Trey Patterson    Pure PG           Villanova    BE   
51200         61057   Stavros Polatoglou          C    Northwestern St.  Slnd   
51201         61058           Sandy Ryan       PF/C              Tulane  Amer   
51202         61059            Ty Larson       PF/C          Texas Tech   B12   
51203         61060          Jaden Jones    Pure PG             Rutgers   B10

In [3]:
df.head()

Unnamed: 0,Unnamed: 0.1,player_name,position,team,conf,games_played,game_min_%,pts_per_game,field_goal_%,3_pt_%,...,blk_per_game,offensive_rating,usage,classification,year,Unnamed: 0_addinfo,Position,Height,Weight,process_status
0,0,Isaiah Felder,Wing G,South Carolina St.,MEAC,11,17.6,2.3636,34.7,0.25,...,0.0,61.1,18.6,Sr,2022,,,,,
1,1,Jalen Coleman-Lands,Wing G,Kansas,B12,23,78.5,14.3043,54.0,0.395,...,0.3,103.1,21.5,Sr,2022,,,,,
2,2,K.J. Walton,Wing G,Akron,MAC,20,63.0,16.35,54.3,0.0,...,0.7,108.6,26.5,Sr,2022,,,,,
3,3,Jeriah Horne,Stretch 4,Tulsa,Amer,32,61.3,10.8125,55.2,0.397,...,0.3,116.0,20.8,Sr,2022,,,,,
4,4,Eric Curry,Wing F,Minnesota,B10,29,39.2,3.6552,46.3,0.19,...,1.5,95.1,14.7,Sr,2022,,,,,


In [4]:
df.columns

Index(['Unnamed: 0.1', 'player_name', 'position', 'team', 'conf',
       'games_played', 'game_min_%', 'pts_per_game', 'field_goal_% ', '3_pt_%',
       'Free_throw_%', 'offensive_rebound_per_game',
       'defensive_ rebound_per_game', 'AST_per_game', 'stl_per_game',
       'blk_per_game', 'offensive_rating', 'usage', 'classification', 'year',
       'Unnamed: 0_addinfo', 'Position', 'Height', 'Weight', 'process_status'],
      dtype='object')

In [5]:
# List all algorithms in use- 1
df['Draft Pick Y/N'].unique()
df

KeyError: 'Draft Pick Y/N'

In [None]:
# List all algorithms in use- 2
algorithms = set(df.Algorithm.values)
print(algorithms)

In [None]:
# print the counts of stocks being currently traded vs. not being currently traded 
df['IsTrading'].value_counts()

In [None]:
# Select only cryptocurrencies that are being traded
df = df.loc[df['IsTrading'] == True]
df['IsTrading'].value_counts()

In [None]:
currency= df.CoinName.values
print(currency)
#df['CoinName'].unique()

In [None]:
# Delete the IsTrading column; it is no longer necessary
df=df.drop('IsTrading',axis = 1)
df

In [None]:
# Delete all rows with null values
df1 = df.dropna()
df1

In [None]:
# get instances and variables of df- shape
df1 = df.dropna(axis=0, how='any')
df1.shape

In [None]:
# Sort values in TotalCoinsMined to identify negative or zero values
#df = df1['TotalCoinsMined'].sort_values()
#df
df1.TotalCoinsMined.sort_values()

In [None]:
# Select only rows with positive TotalCoinsMined values
df1= df1[df1['TotalCoinsMined']>0]
len(df1)

In [None]:
# Delete the CoinName column from the original dataframe
df1 = df1.drop('CoinName',axis=1)
df1

In [None]:
# Create dummy variables for columns with string values
# pd.get_dummies(data, prefix, dummy_na, columns, drop_first)
x_crypto = pd.get_dummies(df1, columns=['Algorithm', 'ProofType'])
#x_crypto = pd.get_dummies(data=crypto_df, columns=['Algorithm', 'ProofType'])

# Print encoding variable
print(x_crypto.shape)

In [None]:
# Display the dataframe
x_crypto.head()

In [None]:
# Standardize the data
x_scaler = StandardScaler()
crypto_scaler = x_scaler.fit_transform(x_crypto)

In [None]:
# print the scaled data
crypto_scaler[0]

In [None]:
# Identify the numbers of rows and columns in the scaled data
crypto_scaler.shape

### Reduce dataset dimensions with PCA

In [None]:
# Reduce dimensions with PCA- decomposition
pca1 = PCA(n_components = 0.9)

In [None]:
crypto_pca = pca1.fit_transform(crypto_scaler)
crypto_pca.shape

In [None]:
# The sum of the explained variance of the principal components
pca1.explained_variance_.sum()

In [None]:
# Reduce dimensions with t-SNE
crypto_tsne = TSNE(perplexity = 60)
tsne_features = crypto_tsne.fit_transform(crypto_pca)
tsne_features.shape

In [None]:
# Plot t-SNE output (x -60 10, y -20 50)
x_axis = tsne_features[:,0]
y_axis = tsne_features[:,1]
plt.scatter(x_axis, y_axis)
plt.show

In [None]:
# Identify clusters with k-means
inertia = []
k = [1,2,3,4,5,6,7,8,9,10] 
# list(range(1,10))

# get the inertia for the range of k values
for i in k:
    km = KMeans(n_clusters= i , random_state=0)
    km.fit(crypto_pca)
    inertia.append(km.inertia_)

In [None]:
# create elbow df
elbow_data = {"k": k, "inertia": inertia}
df2 = pd.DataFrame(elbow_data)

In [None]:
# Create an elbow plot x 2,4,6,8,10, y 39000 47000
plt.plot(df2['k'], df2['inertia'])
plt.xticks(range(1,10))
#,[2,4,6,8,10]
plt.xlabel('k/ number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Plot')
plt.legend('inertia')
plt.show()