In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Reading Files

In questo file vediamo come importare dati di tipo tabulare e caricarli come Dataframe

In [2]:
# index_col selects which column of the file becomes the row index: it takes
# the position or the name of that column (e.g. index_col=0 would use
# business_id). Passing None means no column is used as the index, so pandas
# generates a default integer index; False also prevents the first column
# from being used as the index.
dataset = pd.read_csv(filepath_or_buffer='yelp.csv', index_col=None)
print(type(dataset))
dataset.head(n=5)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


Il file csv è stato importato come Dataframe


In [3]:
# First line: number of rows; second line: (rows, columns) tuple.
print(len(dataset), dataset.shape, sep='\n')

10000
(10000, 10)


## Data Operations

### Selezione di righe/colonne

Una colonna o una riga di un DataFrame possono essere selezionate tramite il nome della colonna o l'etichetta della riga (indice).

In [4]:
# Single column, selected by name (the result is a Series).
print(dataset['business_id'])

# Two columns, selected by passing a list of column names (the result is a
# DataFrame). They are picked by name, not by position.
print(dataset[['business_id','user_id']])

# Rows, selected by index label. Unlike ordinary Python slices, a .loc slice
# includes BOTH endpoints, so labels 0:2 return the first 3 rows
# (0:3 would return 4 rows).
print(dataset.loc[0:2])

0       9yKzy9PApeiPPOUJEtnvkg
1       ZRJwVLyzEJq1VAihDhYiow
2       6oRAC4uyJCsJl1X0WZpVSA
3       _1QQZuf4zZOyFCvXc0o6Vg
4       6ozycU1RpktNG2-1BroVtw
                 ...          
9995    VY_tvNUCCXGXQeSvJl757Q
9996    EKzMHI1tip8rC1-ZAy64yg
9997    53YGfwmbW73JhFiemNeyzQ
9998    9SKdOoDHcFoxK5ZtsgHJoA
9999    pF7uRzygyZsltbmVpjIyvw
Name: business_id, Length: 10000, dtype: object
                 business_id                 user_id
0     9yKzy9PApeiPPOUJEtnvkg  rLtl8ZkDX5vH5nAx9C3q5Q
1     ZRJwVLyzEJq1VAihDhYiow  0a2KyEL0d3Yb1V6aivbIuQ
2     6oRAC4uyJCsJl1X0WZpVSA  0hT2KtfLiobPvh6cDC8JQg
3     _1QQZuf4zZOyFCvXc0o6Vg  uZetl9T0NcROGOyFfughhg
4     6ozycU1RpktNG2-1BroVtw  vYmM4KTsC8ZfQBg-j5MWkw
...                      ...                     ...
9995  VY_tvNUCCXGXQeSvJl757Q  _eqQoPtQ3e3UxLE4faT6ow
9996  EKzMHI1tip8rC1-ZAy64yg  ROru4uk5SaYc3rg8IU7SQw
9997  53YGfwmbW73JhFiemNeyzQ  gGbN1aKQHMgfQZkqlsuwzg
9998  9SKdOoDHcFoxK5ZtsgHJoA  0lyVoNazXa20WzUyZPLaQQ
9999  pF7uRzygyZsltbmVpjIyvw

### Filtraggio dei dati

Il filtraggio dei dati si fa andando a specificare una condizione (booleana) all'interno delle parentesi quadre.

In [5]:
# A boolean condition used as a row selector returns only the rows for which
# it evaluates to True; here: every row whose 'cool' value is greater than 1.
dataset.loc[dataset['cool'] > 1]

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
5,-yxfBYGB6SEqszmxJxd97A,2007-12-13,m2CKSsepBCoRYWxiRUsxAg,4,"Quiessence is, simply put, beautiful. Full wi...",review,sqYN3lNgvPbPCTRsMFu27g,4,3,1
6,zp713qNhx8d9KCJJnrw1xA,2010-02-12,riFQ3vxNpP4rWLk_CSri2A,5,Drop what you're doing and drive here. After I...,review,wFweIWhv2fREZV_dYkz_1g,7,7,4
16,supigcPNO9IKo6olaTNV-g,2008-10-12,HXP_0Ul-FCmA4f-k9CqvaQ,3,We went here on a Saturday afternoon and this ...,review,SBbftLzfYYKItOMFwOTIJg,3,4,2
18,b5cEoKR8iQliq-yT2_O0LQ,2009-03-06,v0cTd3PNpYCkTyGKSpOfGA,3,I met a friend for lunch yesterday. \n\nLoved ...,review,UsULgP4bKA8RMzs8dQzcsA,5,6,4
...,...,...,...,...,...,...,...,...,...,...
9977,iDYzGVIF1TDWdjHNgNjCVw,2012-10-30,qaNZyCUJA6Yp0mvPBCknPQ,5,Why did I wait so long to try this neighborhoo...,review,Id-8-NMEKxeXBR44eUdDeA,3,6,3
9979,GV1P1x9eRb4iZHCxj5_IjA,2012-12-07,eVUs1C4yaVJNrc7SGTAheg,5,Highly recommend. This is my second time here ...,review,bJFdmJJxfXgCYA5DMmyeqQ,2,2,1
9980,GHYOl_cnERMOhkCK_mGAlA,2011-07-03,Q-y3jSqccdytKxAyo1J0Xg,5,5 stars for the great $5 happy hour specials. ...,review,xZvRLPJ1ixhFVomkXSfXAw,6,6,4
9981,AX8lx9wHNYT45lyd7pxaYw,2008-11-27,IyunTh7jnG7v3EYwfF3hPw,5,We brought the entire family to Giuseppe's las...,review,fczQCSmaWF78toLEmb0Zsw,10,9,5


In [6]:
# Selection can also be applied along the column axis: all rows, one column.
dataset.loc[:, 'business_id']

0       9yKzy9PApeiPPOUJEtnvkg
1       ZRJwVLyzEJq1VAihDhYiow
2       6oRAC4uyJCsJl1X0WZpVSA
3       _1QQZuf4zZOyFCvXc0o6Vg
4       6ozycU1RpktNG2-1BroVtw
                 ...          
9995    VY_tvNUCCXGXQeSvJl757Q
9996    EKzMHI1tip8rC1-ZAy64yg
9997    53YGfwmbW73JhFiemNeyzQ
9998    9SKdOoDHcFoxK5ZtsgHJoA
9999    pF7uRzygyZsltbmVpjIyvw
Name: business_id, Length: 10000, dtype: object

Proviamo a combinare i filtri.

In [7]:
# Keep only the rows with cool > 5 and, for those rows, return just the
# three columns of interest.
print(dataset.loc[dataset['cool'] > 5, ['cool', 'user_id', 'stars']])

      cool                 user_id  stars
6        7  wFweIWhv2fREZV_dYkz_1g      5
82      11  mfvezpz6ohS0NQk3DZdvqQ      5
90       7  8tbXmjYGsYFZXk6ppuwRWQ      1
112      6  QKW7sYPWPSsIcWqSiDzChQ      5
125      8  C6IOtaaYdLIT5fWd7ZYIuA      5
...    ...                     ...    ...
9904    13  eBwBjylS66qPcHs2_ajLag      4
9942     8  l81ILmOhky5bG7o4r3rkhQ      5
9956    14  P2kVk4cIWyK4e4h14RhK-Q      5
9980     6  xZvRLPJ1ixhFVomkXSfXAw      5
9981    10  fczQCSmaWF78toLEmb0Zsw      5

[282 rows x 3 columns]


In [8]:
# Select the rows where both conditions hold (cool > 5 AND stars > 3) and
# return only the columns of interest.
dataset.loc[
    (dataset['cool'] > 5) & (dataset['stars'] > 3),
    ['business_id', 'user_id', 'cool', 'stars'],
]

Unnamed: 0,business_id,user_id,cool,stars
6,zp713qNhx8d9KCJJnrw1xA,wFweIWhv2fREZV_dYkz_1g,7,5
82,d7_jkx0VPx3uHsUl18iHMQ,mfvezpz6ohS0NQk3DZdvqQ,11,5
112,StiQ_lcCY8sX4JI-J6Mufg,QKW7sYPWPSsIcWqSiDzChQ,6,5
125,rZbHg4ACfN3iShdsT47WKQ,C6IOtaaYdLIT5fWd7ZYIuA,8,5
144,zp713qNhx8d9KCJJnrw1xA,dQO0tQISZyb9L4d5ASnXyQ,16,5
...,...,...,...,...
9904,3n9mSKySEv3G03YjcU-YOQ,eBwBjylS66qPcHs2_ajLag,13,4
9942,iV7D7fHKb-bF9fCL_bEMtA,l81ILmOhky5bG7o4r3rkhQ,8,5
9956,7tPe20uDErh-iSkfNEWzVQ,P2kVk4cIWyK4e4h14RhK-Q,14,5
9980,GHYOl_cnERMOhkCK_mGAlA,xZvRLPJ1ixhFVomkXSfXAw,6,5


Altre operazioni

In [9]:
# Count, column by column, the rows left after applying the filter.
print(
    dataset.loc[
        (dataset['cool'] > 5) & (dataset['stars'] > 3),
        ['business_id', 'user_id', 'cool', 'stars'],
    ].count()
)

business_id    216
user_id        216
cool           216
stars          216
dtype: int64


In [10]:
# Summary statistics of the rows left after applying the filter.
print(
    dataset.loc[
        (dataset['cool'] > 5) & (dataset['stars'] > 3),
        ['business_id', 'user_id', 'cool', 'stars'],
    ].describe()
)

             cool       stars
count  216.000000  216.000000
mean     9.986111    4.481481
std      6.513299    0.500818
min      6.000000    4.000000
25%      6.000000    4.000000
50%      8.000000    4.000000
75%     11.000000    5.000000
max     77.000000    5.000000


In [11]:
# Count the rows where stars > 4 together with the rows where stars < 2.
dataset.loc[(dataset['stars'] > 4) | (dataset['stars'] < 2), 'stars'].count()

4086

In [12]:
# Cross-check of the OR filter: a row cannot have stars > 4 and stars < 2 at
# the same time, so the per-column counts of the two separate filters must add
# up to the count of the combined filter. The expected total is computed from
# the data instead of being hard-coded, so the check stays valid if the
# dataset changes.
is_high = dataset['stars'] > 4
is_low = dataset['stars'] < 2
expected_total = dataset['stars'][is_high | is_low].count()

dataset[is_high].count() + dataset[is_low].count() == expected_total

business_id    True
date           True
review_id      True
stars          True
text           True
type           True
user_id        True
cool           True
useful         True
funny          True
dtype: bool

### Sorting

Quando andiamo ad applicare un filtro sul dataset, i risultati mantengono di default l'ordine originale delle righe, che in questo caso coincide con l'ordinamento per indice

In [13]:
# First five rows whose 'cool' value equals 5.
dataset.loc[dataset['cool'].eq(5)].head(n=5)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
18,b5cEoKR8iQliq-yT2_O0LQ,2009-03-06,v0cTd3PNpYCkTyGKSpOfGA,3,I met a friend for lunch yesterday. \n\nLoved ...,review,UsULgP4bKA8RMzs8dQzcsA,5,6,4
218,h6jfMpTZpNduLG0wE2tbaw,2008-12-21,lG3Y6LPASX8MyqDedIpBGQ,2,Ate here months ago. Was excited to finally f...,review,0bNXP9quoJEgyVZu9ipGgQ,5,7,3
250,PWG28q4JFOc8FiRBjnfCkA,2012-10-24,W5AdcbX_qzSYQ5VbwWyJsQ,3,"Alright, so here is my two cents on Brat Haus....",review,lmiDCrmas8TxRsbIGZX9Pg,5,6,3
409,bbi-76L0uvQRrno57Qjpaw,2012-11-28,wC9w5m_ROb0dLWMqVXjbDw,4,"Handmade, in-store fine candies aand chocolate...",review,EcgpGtxB5916NPqvXwMcjQ,5,10,2
452,eTbtfgUtfewdncgVf2CbWw,2007-03-22,wAL9qe2PCJORCcAoCL7Gvw,5,WOW this place is good! SO good! And not jus...,review,l53FUDHRHLg7BQ89KgAtxQ,5,5,0


Quello che si vede nella cella soprastante equivale al risultato che si ottiene applicando esplicitamente il metodo *sort_index* della classe
DataFrame: il filtro non riordina le righe, ma l'indice di partenza è già ordinato.

In [14]:
# Same filter, with the rows explicitly ordered by index label.
dataset.loc[dataset['cool'].eq(5)].sort_index().head(n=5)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
18,b5cEoKR8iQliq-yT2_O0LQ,2009-03-06,v0cTd3PNpYCkTyGKSpOfGA,3,I met a friend for lunch yesterday. \n\nLoved ...,review,UsULgP4bKA8RMzs8dQzcsA,5,6,4
218,h6jfMpTZpNduLG0wE2tbaw,2008-12-21,lG3Y6LPASX8MyqDedIpBGQ,2,Ate here months ago. Was excited to finally f...,review,0bNXP9quoJEgyVZu9ipGgQ,5,7,3
250,PWG28q4JFOc8FiRBjnfCkA,2012-10-24,W5AdcbX_qzSYQ5VbwWyJsQ,3,"Alright, so here is my two cents on Brat Haus....",review,lmiDCrmas8TxRsbIGZX9Pg,5,6,3
409,bbi-76L0uvQRrno57Qjpaw,2012-11-28,wC9w5m_ROb0dLWMqVXjbDw,4,"Handmade, in-store fine candies aand chocolate...",review,EcgpGtxB5916NPqvXwMcjQ,5,10,2
452,eTbtfgUtfewdncgVf2CbWw,2007-03-22,wAL9qe2PCJORCcAoCL7Gvw,5,WOW this place is good! SO good! And not jus...,review,l53FUDHRHLg7BQ89KgAtxQ,5,5,0


Tuttavia possiamo anche decidere di ordinare i risultati in base ai valori di una colonna a nostra scelta col metodo *sort_values*

In [15]:
# Same filter, with the rows ordered by the values of the 'funny' column.
dataset.loc[dataset['cool'].eq(5)].sort_values(by='funny').head(n=5)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
1823,IVc23uY-36WUNYoIbz42Fg,2008-01-13,Kibj2ajzr5yyNrEkpIk_BQ,4,I absolutely love the atmosphere and vibe of S...,review,-txH2zJSBZQHO6RWvoWXuQ,5,5,0
5795,xPbUPBWS6k31qgUfUODmnw,2009-08-04,GPRCZ2tVtGLXYUDWTYYIhg,5,You know when you get a really good cut and co...,review,q9XgOylNsSbqZqF_SO3-OQ,5,7,0
2215,whAFYa0ZyTIRe7GamYAmsw,2010-04-03,qm7XZ40KNqh2cJTlnXy91g,4,"Went for the free ""Movies in the Park Free Fam...",review,hq0WUfau1Mh3GyHn8oVdNQ,5,4,0
1203,o2rrTbAxGXrdQKQvKDcGiA,2008-03-12,5KNRveSLv3bPAsyc-32JHg,5,I would not survive without Channel 8. i reall...,review,90a6z--_CUrl84aCzZyPsg,5,3,0
5252,NH67MdKaFGNcP-dlu56pyw,2009-01-16,OUYW_ylJmTRZFRtTEMzAbQ,4,"Impressive dinner tonight! However, it was a ...",review,0bNXP9quoJEgyVZu9ipGgQ,5,6,0


### Null values

Ora concentriamoci sui valori mancanti ('**NULL**') del dataset.

Tali celle sono riempite col valore '**NaN**'.

In [16]:
# isna() (isnull() is an alias of it) yields True wherever a cell is empty.
# This dataset has no null values, so the next cells work on a modified
# version of it from which a few values have been removed.
dataset.isna().describe()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
count,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
unique,1,1,1,1,1,1,1,1,1,1
top,False,False,False,False,False,False,False,False,False,False
freq,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000


In [17]:
# Load the variant of the dataset that contains missing values and show its
# first 22 rows.
dataset_modified = pd.read_csv(filepath_or_buffer='yelp_modified.csv', index_col=None)
dataset_modified.head(n=22)

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5.0,,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5.0,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5.0,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0.0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4.0,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1.0,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5.0,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2.0,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5.0,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0.0,0
5,-yxfBYGB6SEqszmxJxd97A,2007-12-13,m2CKSsepBCoRYWxiRUsxAg,4.0,"Quiessence is, simply put, beautiful. Full wi...",review,sqYN3lNgvPbPCTRsMFu27g,4,3.0,1
6,zp713qNhx8d9KCJJnrw1xA,2010-02-12,riFQ3vxNpP4rWLk_CSri2A,5.0,Drop what you're doing and drive here. After I...,review,wFweIWhv2fREZV_dYkz_1g,7,7.0,4
7,hW0Ne_HTHEAgGF1rAdmR-g,2012-07-12,JL7GXJ9u4YMx7Rzs05NfiQ,4.0,"Luckily, I didn't have to travel far to make m...",review,1ieuYcKS7zeAv_U15AB13A,0,1.0,0
8,wNUea3IXZWD63bbOQaOH-g,,XtnfnYmnJYi71yIuGsXIUA,,Definitely come for Happy hour! Prices are ama...,review,Vh_DlizgGhSqQh4qfZ2h6A,0,0.0,0
9,nMHhuYan8e3cONo3PornJA,2010-08-11,jJAIXA46pU1swYyRCdfXtQ,5.0,Nobuo shows his unique talents with everything...,review,sUNkXg8-KFtCMQDV6zRzQg,0,1.0,0


In [18]:
# As the summary shows, this version of the dataset does contain null values.
dataset_modified.isna().describe()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
count,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
unique,2,2,1,2,2,1,1,1,2,1
top,False,False,False,False,False,False,False,False,False,False
freq,9999,9999,10000,9998,9998,10000,10000,10000,9999,10000


In [19]:
# For example, the business_id column turns out to contain 1 null value.
dataset_modified.loc[:, 'business_id'].isna().describe()

count     10000
unique        2
top       False
freq       9999
Name: business_id, dtype: object

In [20]:
# Keep only the rows in which the 'business_id' field has no value.
dataset_modified.loc[
    dataset_modified['business_id'].isna(),
    ['business_id', 'text', 'user_id'],
]

Unnamed: 0,business_id,text,user_id
21,,This place shouldn't even be reviewed - becaus...,YN3ZLOdg8kpnfbVcIhuEZA


In [21]:
# Keep only the rows in which the 'useful' field has no value.
dataset_modified.loc[
    dataset_modified['useful'].isna(),
    ['business_id', 'text', 'user_id', 'useful'],
]

Unnamed: 0,business_id,text,user_id,useful
15,Vb9FPCEL6Ly24PNxLBaAFw,Was it worth the 21$ for a salad and small piz...,ylWOj2y7TV2e3yYeWhu2QA,


Possiamo anche usare il metodo *fillna* (disponibile per classe Series e classe Dataframe) per riempire 
le celle non valorizzate.

In [22]:
# Keep four columns, replace every missing cell with a placeholder string and
# retain only the first 16 rows. Note that this rebinds dataset_modified.
dataset_modified = (
    dataset_modified
    .loc[:, ['business_id', 'text', 'user_id', 'useful']]
    .fillna(value='No Value!')
    .head(n=16)
)

In [23]:
# Show the first 16 rows by position.
dataset_modified.iloc[:16]

Unnamed: 0,business_id,text,user_id,useful
0,9yKzy9PApeiPPOUJEtnvkg,No Value!,rLtl8ZkDX5vH5nAx9C3q5Q,5
1,ZRJwVLyzEJq1VAihDhYiow,I have no idea why some people give bad review...,0a2KyEL0d3Yb1V6aivbIuQ,0
2,6oRAC4uyJCsJl1X0WZpVSA,love the gyro plate. Rice is so good and I als...,0hT2KtfLiobPvh6cDC8JQg,1
3,_1QQZuf4zZOyFCvXc0o6Vg,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",uZetl9T0NcROGOyFfughhg,2
4,6ozycU1RpktNG2-1BroVtw,General Manager Scott Petello is a good egg!!!...,vYmM4KTsC8ZfQBg-j5MWkw,0
5,-yxfBYGB6SEqszmxJxd97A,"Quiessence is, simply put, beautiful. Full wi...",sqYN3lNgvPbPCTRsMFu27g,3
6,zp713qNhx8d9KCJJnrw1xA,Drop what you're doing and drive here. After I...,wFweIWhv2fREZV_dYkz_1g,7
7,hW0Ne_HTHEAgGF1rAdmR-g,"Luckily, I didn't have to travel far to make m...",1ieuYcKS7zeAv_U15AB13A,1
8,wNUea3IXZWD63bbOQaOH-g,Definitely come for Happy hour! Prices are ama...,Vh_DlizgGhSqQh4qfZ2h6A,0
9,nMHhuYan8e3cONo3PornJA,Nobuo shows his unique talents with everything...,sUNkXg8-KFtCMQDV6zRzQg,1


### String Operations

Esempio di ricerca di una stringa.

In [24]:
# Number of rows whose business_id equals the given string, followed by the
# matching rows themselves.
print(dataset.loc[dataset['business_id'].eq('6oRAC4uyJCsJl1X0WZpVSA'), 'business_id'].count())
dataset.loc[dataset['business_id'].eq('6oRAC4uyJCsJl1X0WZpVSA')]

12


Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
543,6oRAC4uyJCsJl1X0WZpVSA,2012-03-22,6Gb_wp8N0TFXcYDkuRyWOQ,5,The best gyro you will have in Arizona...\nor ...,review,QUTW6GQnEAJSqrzoUJ0m7Q,0,0,0
854,6oRAC4uyJCsJl1X0WZpVSA,2011-07-08,qdmWiBaolhOnibLQh5W_Gg,5,My husband and I have been going here for a fe...,review,sbNbxmGNodDJukgSXBFD4Q,0,0,0
923,6oRAC4uyJCsJl1X0WZpVSA,2010-05-01,j2Eno7wxttUkoeClhctwkQ,4,"The place is nothing special, you can even thi...",review,LqgGgWi3FLHBViX9tmZ9sw,1,1,0
1052,6oRAC4uyJCsJl1X0WZpVSA,2012-07-24,NiqLX8pW-R2OrnjUxgyXkQ,5,I had the pleasure of living across the street...,review,XpSZrY_Ym8GGx7SNEd0q9g,2,3,2
1498,6oRAC4uyJCsJl1X0WZpVSA,2010-04-20,MDtjJub6L_2dgkSxF1lipA,5,This place is a gem. The food is amazing. It's...,review,YEVokKX9G9PgHJ_gK4gO_g,1,1,0
2455,6oRAC4uyJCsJl1X0WZpVSA,2010-05-17,XABPPSphqJ4PVrZbZ_AjtA,4,Haji-Baba is unique by being both a restaurant...,review,kq5Pdsy8Znyh9KEkxWT_QA,0,0,0
3014,6oRAC4uyJCsJl1X0WZpVSA,2011-02-12,UcnkWyPd2MmLSjtmk-W0Kw,4,"Go for the food and you won't be disappointed,...",review,jXp0eFDh_LNO8JfcGlV3pw,2,2,0
6405,6oRAC4uyJCsJl1X0WZpVSA,2012-04-20,ewd-GXTUEC5yaOndM0_sgw,4,So i'm not here to buy spices or shop for shee...,review,GRgBu4K7GOb3354esp_xkg,1,4,3
7925,6oRAC4uyJCsJl1X0WZpVSA,2011-07-28,u9aMNOK1_eVF8PL_CpVSrg,5,Love love love this place. Tasty with great p...,review,fPHLPrymsyb6WSFFKoMrTQ,3,4,3


Posso anche richiamare, per gli oggetti Series, i metodi propri delle stringhe ('str') tramite l'accessor *str*.

In [25]:
# Element-wise test: does each business_id start with '6o'? (first 5 rows)
(
    dataset.loc[:, 'business_id']
    .str.startswith('6o')
    .head(n=5)
)

0    False
1    False
2     True
3    False
4     True
Name: business_id, dtype: bool

In [26]:
# Element-wise test: does each business_id end with 'VSA'? (first 5 rows)
(
    dataset.loc[:, 'business_id']
    .str.endswith('VSA')
    .head(n=5)
)

0    False
1    False
2     True
3    False
4    False
Name: business_id, dtype: bool

Attraverso il metodo *rsplit* prendo la Series dataset['date'] e la trasformo in una Series di liste contenenti
anno, mese, giorno.

In [27]:
# Split every date string on '-' into a list of its parts (first 5 rows).
(
    dataset.loc[:, 'date']
    .str.rsplit(pat='-')
    .head(n=5)
)

0    [2011, 01, 26]
1    [2011, 07, 27]
2    [2012, 06, 14]
3    [2010, 05, 27]
4    [2012, 01, 05]
Name: date, dtype: object

In [28]:
# Split once and reuse the result instead of recomputing it on every line.
date_parts = dataset['date'].str.rsplit('-').head()
first_date = date_parts[0]     # element with index label 0

print(type(date_parts))        # Series
print(type(first_date))        # list
print(type(first_date[0]))     # str
print(first_date)
print(first_date[2])

<class 'pandas.core.series.Series'>
<class 'list'>
<class 'str'>
['2011', '01', '26']
26


### Value_counts

Col metodo *value_counts* ho la distribuzione di frequenza dei valori presenti in una certa colonna; si tratta di un metodo della classe Series.

In [29]:
stars_counts = dataset['stars'].value_counts()
print(type(stars_counts))

# Shows how many rows have 5 stars, how many have 4, and so on.
stars_counts

<class 'pandas.core.series.Series'>


4    3526
5    3337
3    1461
2     927
1     749
Name: stars, dtype: int64

In [30]:
# Cumulative frequency distribution of the 'stars' values.
(
    dataset.loc[:, 'stars']
    .value_counts()
    .cumsum()
)

4     3526
5     6863
3     8324
2     9251
1    10000
Name: stars, dtype: int64

### Groupby

Quello che abbiamo fatto poc'anzi col metodo value_counts() può essere fatto anche col metodo *groupby*; è un metodo della classe DataFrame di Pandas.


In [31]:
# groupby returns an object of type 'DataFrameGroupBy', which in turn offers
# several methods of its own, size() among them.
stars_groups = dataset.groupby(['stars'])
print(stars_groups)

# Row count for each distinct value of 'stars'.
stars_groups.size()

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000014FC6586408>


stars
1     749
2     927
3    1461
4    3526
5    3337
dtype: int64

Il groupby può essere eseguito anche su più colonne.

In [32]:
# Group sizes for every (type, stars) combination.
sizes_by_type_stars = dataset.groupby(by=['type', 'stars']).size()
print(sizes_by_type_stars)

# Group sizes for every (type, stars, cool) combination; first 5 groups only.
sizes_by_type_stars_cool = dataset.groupby(by=['type', 'stars', 'cool']).size()
print(sizes_by_type_stars_cool.head(n=5))

type    stars
review  1         749
        2         927
        3        1461
        4        3526
        5        3337
dtype: int64
type    stars  cool
review  1      0       549
               1       118
               2        29
               3        24
               4        10
dtype: int64


Possiamo anche specificare un filtro sul groupby (a monte del groupby).

In [33]:
# Two steps: first keep only the rows where 'stars' equals 5, then group
# those rows by 'type' and, within it, by 'cool', counting the rows per group.
five_star_rows = dataset.loc[dataset['stars'].eq(5)]
print(five_star_rows.groupby(by=['type', 'cool']).size())

type    cool
review  0       2072
        1        660
        2        265
        3        131
        4         62
        5         43
        6         26
        7         13
        8         14
        9          7
        10        11
        11         7
        12         3
        13         3
        14         6
        15         1
        16         3
        17         4
        19         1
        23         1
        27         1
        28         1
        32         1
        77         1
dtype: int64
