In [1]:
import pandas as pd
import numpy as np

Immaginiamo di avere strutturato i dati di palestre, passi, squat, pushups, google trends, impianti sportivi per 100000 abitanti:

SE NON SONO TUTTI ORDINATI ALLO STESSO MODO A LIVELLO DI REGIONI, TUTTI I DATASET VANNO ORDINATI IN ORDINE ALFABETICO IN BASE ALLA REGIONE, PER EVITARE CALCOLI SBAGLIATI (poi non so se userai query SQL, però credo sia bene averli ordinati alfabeticamente; vedi tu)

In [2]:
#Loading simulated mobile apps data
stepsdf = pd.read_csv("stepsaggregated.csv")
squatsdf = pd.read_csv("squatsaggregated.csv")
pushupsdf = pd.read_csv("pushupsaggregated.csv")
#Loading daily Google trends data (research interest in sport)
googletrendsdf = pd.read_csv("researchinterestsport.csv")


In [3]:
#ACCESSI ALLE PALESTRE: DATAFRAME
accessi = [i*5 for i in range(20)]
np.random.shuffle(accessi)

#generating a random accesses dataframe
gymaccessesdf = pd.DataFrame((list(zip(stepsdf.region,accessi))))

gymaccessesdf.columns = ['region','accesses']

In [4]:
#IMPIANTI SPORTIVI OGNI 100000 ABITANTI: DATAFRAME
#Simulazione casuale
impianti = [i*5 for i in range(20)]
np.random.shuffle(impianti)

#generating a random accesses dataframe
facilitiesper100k = pd.DataFrame((list(zip(stepsdf.region,impianti))))

facilitiesper100k.columns = ['region','sport_facilities']

Tutti questi dati verranno normalizzati. Intanto, ne salviamo una copia da lasciare non normalizzata:

In [5]:
#Loading simulated mobile apps data
stepsdfraw = pd.read_csv("stepsaggregated.csv")
squatsdfraw = pd.read_csv("squatsaggregated.csv")
pushupsdfraw = pd.read_csv("pushupsaggregated.csv")
#Loading daily Google trends data (research interest in sport)
googletrendsdfraw = pd.read_csv("researchinterestsport.csv")


#Accessi palestre
gymaccessesdfraw = pd.DataFrame((list(zip(stepsdf.region,accessi))))
gymaccessesdfraw.columns = ['region','accesses']

#Facilities
#generating a random accesses dataframe
facilitiesper100kraw = pd.DataFrame((list(zip(stepsdf.region,impianti))))

facilitiesper100kraw.columns = ['region','sport_facilities']

# Daily steps data
I dati di squat e flessioni sono analoghi nella struttura

In [6]:
stepsdf.head()

Unnamed: 0,region,steps
0,Abruzzo,4379.75484
1,Basilicata,3230.39198
2,Calabria,3166.57721
3,Campania,2933.24814
4,Emilia-Romagna,5051.28323


# Google trends data

In [7]:
googletrendsdf.head()

Unnamed: 0,geoName,sport
0,Abruzzo,86
1,Basilicata,80
2,Calabria,82
3,Campania,75
4,Emilia-Romagna,86


# Gym accesses data (hypothesis)

In [8]:
gymaccessesdf.head()

Unnamed: 0,region,accesses
0,Abruzzo,85
1,Basilicata,35
2,Calabria,25
3,Campania,55
4,Emilia-Romagna,75


# Sport facilities accesses data (hypothesis)

In [9]:
facilitiesper100k.head()

Unnamed: 0,region,sport_facilities
0,Abruzzo,70
1,Basilicata,65
2,Calabria,95
3,Campania,85
4,Emilia-Romagna,40


Come prima cosa li **normalizziamo** tutti, in scala 0-1:

In [10]:
#Normalizing mobile apps data
stepsdf['steps'] = (stepsdf['steps'] - min(stepsdf['steps'])) / (max(stepsdf['steps']) - min(stepsdf['steps']))
squatsdf['squats'] = (squatsdf['squats'] - min(squatsdf['squats'])) / (max(squatsdf['squats']) - min(squatsdf['squats']))
pushupsdf['pushups'] = (pushupsdf['pushups'] - min(pushupsdf['pushups'])) / (max(pushupsdf['pushups']) - min(pushupsdf['pushups']))


#Rounding mobile apps data after normalization
stepsdf['steps'] = round(stepsdf['steps'],2)
squatsdf['squats']  = round(squatsdf['squats'],2)
stepsdf['pushups'] = round(pushupsdf['pushups'],2)


In [11]:
#Normalizing Google Trends data

googletrendsdf['sport'] = (googletrendsdf['sport'] - min(googletrendsdf['sport'])) / (max(googletrendsdf['sport']) - min(googletrendsdf['sport']))

googletrendsdf['sport'] = round(googletrendsdf['sport'],2)


In [12]:
#Normalizing gym accesses data

gymaccessesdf['accesses'] = (gymaccessesdf['accesses'] - min(gymaccessesdf['accesses'])) / (max(gymaccessesdf['accesses']) - min(gymaccessesdf['accesses']))
gymaccessesdf['accesses'] = round(gymaccessesdf['accesses'],2)


In [13]:
#Normalizing sport facilities data

facilitiesper100k['sport_facilities'] = (facilitiesper100k['sport_facilities'] - min(facilitiesper100k['sport_facilities'])) / (max(facilitiesper100k['sport_facilities']) - min(facilitiesper100k['sport_facilities']))

facilitiesper100k['sport_facilities'] = round(facilitiesper100k['sport_facilities'],2)


# Normalized data:
**SQUATS**

In [14]:
squatsdf.head()

Unnamed: 0,region,squats
0,Abruzzo,0.4
1,Basilicata,0.08
2,Calabria,0.04
3,Campania,0.0
4,Emilia-Romagna,0.6


**GOOGLE TRENDS**

In [15]:
googletrendsdf.head()

Unnamed: 0,geoName,sport
0,Abruzzo,0.6
1,Basilicata,0.43
2,Calabria,0.49
3,Campania,0.29
4,Emilia-Romagna,0.6


**SPORT FACILITIES**

In [16]:
facilitiesper100k.head()

Unnamed: 0,region,sport_facilities
0,Abruzzo,0.74
1,Basilicata,0.68
2,Calabria,1.0
3,Campania,0.89
4,Emilia-Romagna,0.42


**GYM ACCESSES**

In [17]:
gymaccessesdf.head()

Unnamed: 0,region,accesses
0,Abruzzo,0.89
1,Basilicata,0.37
2,Calabria,0.26
3,Campania,0.58
4,Emilia-Romagna,0.79


# COSTRUZIONE DELL'INDICE DI SPORTIVITA'
# (SPORTINESS INDEX)

**MEDIA DI TUTTI GLI INDICI PRECEDENTI**

6 indicatori: 

**accessi palestre**, 

**palestre per 100000 persone**, 

**dati google trends**,

**dati squats**

**dati pushups**

**dati steps**

In [18]:
#Media dei 6 indicatori
sportiness_index = (squatsdf['squats']+stepsdf['steps']+pushupsdf['pushups']+googletrendsdf['sport']+facilitiesper100k['sport_facilities']+gymaccessesdf['accesses']) / 6
#Arrotondo
sportiness_index = round(sportiness_index, 2)

**DATASET CON SPORTINESS INDEX**

In [19]:
sportinessdf = pd.DataFrame(zip(squatsdf.region, sportiness_index))
sportinessdf.columns = ['region','sportiness_index']

In [20]:
sportinessdf.head()

Unnamed: 0,region,sportiness_index
0,Abruzzo,0.57
1,Basilicata,0.29
2,Calabria,0.32
3,Campania,0.29
4,Emilia-Romagna,0.61


# Dataset finale

CARICO I DATI, PER OGNI REGIONE, DELLO SPORTINESS INDEX APPENA CREATO (è in scala 0-1), E DEGLI ALTRI INDICATORI (non normalizzati; quelli originali):

In [21]:
finaldata = pd.DataFrame()
finaldata['region'] = squatsdf['region']
finaldata['sportiness_index'] = sportinessdf['sportiness_index']
finaldata['steps'] = stepsdfraw['steps']
finaldata['squats'] = squatsdfraw['squats']
finaldata['pushups'] = pushupsdfraw['pushups']
finaldata['gym_accesses'] = gymaccessesdf['accesses']
finaldata['google_research'] = googletrendsdfraw['sport']
finaldata['sport_facilities_per100k'] = facilitiesper100k['sport_facilities']

In [22]:
finaldata.head()

Unnamed: 0,region,sportiness_index,steps,squats,pushups,gym_accesses,google_research,sport_facilities_per100k
0,Abruzzo,0.57,4379.75484,31.501154,16.486795,0.89,86,0.74
1,Basilicata,0.29,3230.39198,23.500256,12.513077,0.37,80,0.68
2,Calabria,0.32,3166.57721,22.501923,12.485256,0.26,82,1.0
3,Campania,0.29,2933.24814,21.503846,11.499359,0.58,75,0.89
4,Emilia-Romagna,0.61,5051.28323,36.515385,19.49141,0.79,86,0.42
