## Importing, cleaning and wrangling

In [24]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic

In [25]:
# import data with a context manager
with open("data_ks.csv") as fp:
    df = pd.read_csv(fp) 

    #names = df['last_name'].tolist()
    
    # Drop unnecessary columns from df
    dropcol = ['accessory_channels', 'submitted_date', 'token']
    df.drop(columns= dropcol, inplace= True)
    
    # convert to categorical data in the df
    catcol = ["age", "gender", "platform", "income", "experience", "rides_per_week", "num_bikes_owned", "bike_cost", "location", "environment", 'accessory_factor' ]
    for col in catcol:
        df[col] = df[col].astype('category')

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1691 entries, 0 to 1690
Data columns (total 22 columns):
age                        1691 non-null category
gender                     1691 non-null category
platform                   1691 non-null category
income                     1691 non-null category
experience                 1691 non-null category
rides_per_week             1691 non-null category
bike_type                  1691 non-null object
num_bikes_owned            1691 non-null category
bike_cost                  1691 non-null category
location                   1691 non-null category
environment                1691 non-null category
companions                 1691 non-null object
conditions                 1691 non-null object
activities                 1691 non-null object
reasons_to_bike            1691 non-null object
reasons_to_not_bike        1691 non-null object
reason_for_smarthalo       1691 non-null object
has_smarthalo_1            1691 non-null object
accessory

In [27]:
for i in catcol:
    print(df[i].value_counts(dropna=False), '\n')

25-34 years    481
35-44 years    472
45-54 years    289
55+ years      285
0-25 years     108
45-45 years     56
Name: age, dtype: int64 

Male                 1431
Female                244
Prefer not to say       9
Non-binary              7
Name: gender, dtype: int64 

iOS        962
Android    729
Name: platform, dtype: int64 

40,000 - 79,000 / year    573
80,000+ / year            564
0 - 39,999 / year         301
Prefer not to say         253
Name: income, dtype: int64 

8     473
7     463
6     248
9     186
5     121
10    106
4      50
3      23
2      12
1       9
Name: experience, dtype: int64 

3 or more times a week    942
1-2 times a week          520
Less than once a week     229
Name: rides_per_week, dtype: int64 

1       682
2       588
3+      410
None     11
Name: num_bikes_owned, dtype: int64 

1000 - 2000      554
500 - 1000       490
2000 - 4000      309
100 - 500        204
4000+            122
Less than 100     12
Name: bike_cost, dtype: int64 

United States

## Exploring ML as analysis tool

### Unsupervized learning in Python Chap3 p. 33
scipy.sparse.csr_matrix

In [28]:
to_sparse = ['bike_type', 'companions', 'conditions', 'activities', 'reasons_to_bike', 'reasons_to_not_bike', 'reason_for_smarthalo']

print('Number of unique combinaison of answer for each questions', '\n')
for i in to_sparse:
    print('{}:  '.format(i.capitalize()), df[i].nunique())

Number of unique combinaison of answer for each questions 

Bike_type:   337
Companions:   33
Conditions:   137
Activities:   750
Reasons_to_bike:   1016
Reasons_to_not_bike:   720
Reason_for_smarthalo:   537


### NLP tokenization & sklearn CountVectorizer() 

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate the CountVectorizer: vec
vec = CountVectorizer(tokenizer=lambda x: x.split(', '))  # Object to convert in matrix of token counts 

# Fit to the data
for i in to_sparse:
    vec.fit(df[i]) # Convert df.bike_type to matrix of token
    print(vec.get_feature_names(), '\n')

#Archives
# TOKENS = '(?:[a-zA-Z1-9/-]|\s)' # Take only strings between ',' as token but, lamba function x.split(', ') works better   
# print("There are {} tokens in the dataset".format(len(vec.get_feature_names())))

['a brompton', 'a foldable bike', 'bike share bike', 'bmx', 'brompton', 'brompton folding bike', 'cargo bike', 'cross bike', 'crossbike', 'crossbike ', 'cruiser / city bike', 'cyclo cross', 'cyclocross', 'dirt/gravel trike', 'dual speed city road bike', 'dutch bike', 'e-bike', 'e-mountainbike', 'e-road bike', 'e-trike', 'elliptigo', 'fat bike', 'floding bikes', 'foldable', 'foldable bike', 'foldie', 'folding bicycle', 'folding bike', 'folding bike ', 'folding bike (brompton)', 'folding bike (dahon mu)', 'folding commuter bike', 'gravel bike', 'halfbike', 'holiday bike', 'hometrainer', 'hybrid bike', 'i have inokim electronic scooter', 'minivero', 'mountain bike', 'polo bike', 'race bike', 'recumbent', 'recumbent trike', 'road bike', 'scooter', 'short frame recumbent', 'single-speed bike', 'spin bike', 'tandem', 'tandem bike', "tandem bike (bike friday 'q')", 'tempo', 'time trial bike', 'tourbike', 'touring bike', 'tracking bike', 'travel bike', 'trekking', 'trekking ', 'trekking bike',

In [30]:
# {'col_to_parse': {'value_to_replace': 'replace_with_value'}}
replace_val = {'bike_type': {'gravel bike': 'all road', 'Brompton': 'foldable', 'Brompton folding ': 'foldable', 'folding': 'foldable', 'FALTRAD': 'foldable', 'fat': 'fat bike', 'recumbent trike': 'recumbent', 'speed pedelec (45km/h e-bike)': 'e-bike', 'time-trial race bike': 'road bike', 'touring': 'touring bike'}}
df.bike_type.replace(replace_val, inplace=True)

In [31]:
vec.fit(df['bike_type'])
vec.get_feature_names()

['a brompton',
 'a foldable bike',
 'bike share bike',
 'bmx',
 'brompton',
 'brompton folding bike',
 'cargo bike',
 'cross bike',
 'crossbike',
 'crossbike ',
 'cruiser / city bike',
 'cyclo cross',
 'cyclocross',
 'dirt/gravel trike',
 'dual speed city road bike',
 'dutch bike',
 'e-bike',
 'e-mountainbike',
 'e-road bike',
 'e-trike',
 'elliptigo',
 'fat bike',
 'floding bikes',
 'foldable',
 'foldable bike',
 'foldie',
 'folding bicycle',
 'folding bike',
 'folding bike ',
 'folding bike (brompton)',
 'folding bike (dahon mu)',
 'folding commuter bike',
 'gravel bike',
 'halfbike',
 'holiday bike',
 'hometrainer',
 'hybrid bike',
 'i have inokim electronic scooter',
 'minivero',
 'mountain bike',
 'polo bike',
 'race bike',
 'recumbent',
 'recumbent trike',
 'road bike',
 'scooter',
 'short frame recumbent',
 'single-speed bike',
 'spin bike',
 'tandem',
 'tandem bike',
 "tandem bike (bike friday 'q')",
 'tempo',
 'time trial bike',
 'tourbike',
 'touring bike',
 'tracking bike',


In [32]:
df_text = df[to_sparse].apply(lambda x: ", ".join(x), axis=1)
df_text.head()

0    E-bike, With family and friends, Temperate, Mo...
1    Mountain bike, Hybrid bike, Road bike, I bike ...
2    Cruiser / City bike, With family and friends, ...
3    Road bike, Mountain bike, I bike alone, Night,...
4    Road bike, With family and friends, Hot/Humid,...
dtype: object

In [33]:
# Instantiate the CountVectorizer: vec
vec = CountVectorizer(tokenizer=lambda x: x.split(', '))  # Object to convert in matrix of token counts 

# Fit to the data
X = vec.fit_transform(df_text) # Convert df.bike_type to matrix of token
print("There are {} tokens in the dataset".format(len(vec.get_feature_names())))

There are 192 tokens in the dataset


In [34]:
vec.get_feature_names()

['',
 ' because i am fascinated of the thing',
 '<kombination of 4 tools',
 'a brompton',
 'a foldable bike',
 'a useful tool',
 'acctually i intend to try it on my motorbike',
 'all above',
 'as a fancy gps bike speedometer (my current one is not precise at all)',
 'as a gift',
 'as a gift ',
 'as a gift for my cycle loving in laws',
 'as a present',
 'because it looks to be well-priced',
 "because it's cool!",
 'because smarthalo 1 was not so good',
 'bike share bike',
 'bmx',
 'brewery hopping ',
 'brompton',
 'brompton folding bike',
 'can use it in the winter!! :)',
 'cargo bike',
 'chores / errands',
 'commuting',
 'cross bike',
 'crossbike',
 'crossbike ',
 'cruiser / city bike',
 'curiosity',
 'curious about some new things',
 'cyclo cross',
 'cyclocross',
 'davis',
 'deux montagnes rem project pushed me to bike from laval to downtown montreal next year',
 'dirt/gravel trike',
 'driving a car and drinking doesn’t match',
 'dual speed city road bike',
 'dutch bike',
 'e-bike',
 

In [65]:
# X_train, X_test, y_train, y_test = train_test_split(vec, random_state=456)
df_matrix = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

  (0, 39)	1
  (0, 188)	1
  (0, 150)	1
  (0, 118)	1
  (0, 169)	1
  (0, 49)	1
  (0, 109)	1
  (0, 87)	1
  (0, 108)	1
  (0, 77)	1
  (0, 154)	1
  (0, 158)	1
  (0, 162)	1
  (0, 137)	1
  (0, 161)	1
  (1, 150)	1
  (1, 49)	1
  (1, 87)	1
  (1, 108)	1
  (1, 154)	1
  (1, 161)	1
  (1, 117)	1
  (1, 69)	1
  (1, 135)	1
  (1, 70)	1
  :	:
  (1689, 106)	1
  (1689, 107)	1
  (1689, 88)	1
  (1689, 101)	1
  (1689, 144)	1
  (1689, 68)	1
  (1689, 172)	1
  (1689, 81)	1
  (1689, 104)	1
  (1690, 188)	1
  (1690, 150)	1
  (1690, 118)	1
  (1690, 77)	1
  (1690, 154)	1
  (1690, 117)	1
  (1690, 127)	1
  (1690, 78)	1
  (1690, 110)	1
  (1690, 14)	1
  (1690, 144)	1
  (1690, 131)	1
  (1690, 112)	1
  (1690, 81)	1
  (1690, 74)	1
  (1690, 142)	1


### Unsupervized learning in Python Chap1 p. 31
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(samples)
StandardScaler(copy=True, with_mean=True, with_std=True)
samples_scaled = scaler.transform(samples)

### Unsupervized learning in Python Chap4 p. 7
NMF with word frequency array

In [36]:
from sklearn.decomposition import NMF

matrix = X.toarray()
model = NMF(n_components=5)
model.fit(matrix)
nmf_features = model.transform(matrix)

In [37]:
#nmf_features
#df_nmf = pd.DataFrame(nmf_features,index=names)
#df_nmf.loc['Denis']

In [38]:
nmf_features

array([[0.12231073, 0.02323385, 0.20427926, 0.12573124, 0.        ],
       [0.15423672, 0.26036853, 0.00738726, 0.18873421, 0.05855126],
       [0.00259247, 0.14280095, 0.        , 0.07509475, 0.24108659],
       ...,
       [0.09357699, 0.15245619, 0.02110561, 0.03529629, 0.29160295],
       [0.19852674, 0.26982555, 0.21820291, 0.        , 0.11330575],
       [0.12180771, 0.        , 0.        , 0.18326583, 0.07428773]])

In [63]:
# Create a DataFrame: components_df
components_df = pd.DataFrame(model.components_, columns=vec.get_feature_names())

# Print the shape of the DataFrame
print(components_df.shape)

for x in range(0,components_df.shape[0]):
    print("\nPersona "+str(x))
    print("==========")
    component = components_df.iloc[x,:]
    print(component.nlargest(50))
    print("\n")

(5, 192)

Persona 0
the weather doesn't suit me                                                            3.966413
temperate                                                                              2.784269
i want to be healthy                                                                   2.559829
i bike alone                                                                           2.521009
because it's cool!                                                                     2.384590
to help me navigate                                                                    2.306264
so i don't have to look at my phone while riding                                       2.263653
pleasure rides                                                                         1.981890
it's not safe for me                                                                   1.909435
fitness                                                                                1.717054
i have to carry thin

In [40]:
components_df.head()

Unnamed: 0,Unnamed: 1,because i am fascinated of the thing,<kombination of 4 tools,a brompton,a foldable bike,a useful tool,acctually i intend to try it on my motorbike,all above,as a fancy gps bike speedometer (my current one is not precise at all),as a gift,...,vintage road,want to try it,want to try new product,when friends use my bike i know it will help them and be safer,which is irrelevant to me and most serious cyclists (as we have dedicated lights for biking).,with a riding pack,with family and friends,with young children,work,workout
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002037,0.0,0.002375,...,0.002109,0.005533,0.000427,0.0,0.0,0.0,0.456406,0.115858,0.0,0.0
1,0.0,0.0,0.005857,0.005254,0.005254,0.0,0.002176,0.0,0.0,0.0,...,0.0,0.0,0.0,0.005238,0.000539,0.0,0.0,0.052274,1.217513,0.001758
2,0.005596,0.0,0.0,0.001145,0.001145,0.005596,0.002191,0.0,0.0,0.0,...,0.004207,0.0,0.0,0.0,0.0,0.022115,0.353237,0.076944,0.05455,0.003752
3,0.00106,0.0,0.0,0.0,0.0,0.00106,0.0,0.0,0.0,0.0,...,0.000581,0.001229,0.006659,0.0,0.0,0.676213,2.753745,0.088534,0.061165,0.000745
4,0.0,0.006978,0.0,0.0,0.0,0.0,0.0,0.0,0.007223,0.001529,...,0.0,0.0,0.0,0.0,0.004543,0.0,0.0,0.380615,0.650636,0.0


In [41]:
from sklearn.preprocessing import normalize

# Normalize the NMF features: norm_features
norm_features = normalize(nmf_features)

# Create a DataFrame: df
dfpersona = pd.DataFrame(norm_features, index=names)

# Select the row corresponding to 'Cristiano Ronaldo': article
persona = dfpersona.loc['Haywood']

# Compute the dot products: similarities
similarities = dfpersona.dot(persona)

# Display those with the largest cosine similarity
print(similarities.nlargest(20))

NameError: name 'names' is not defined