In [10]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import jinja2
import os


In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Dataset Reviews Yelp

In [3]:
# Leer el archivo JSON como un Dask DataFrame en bloques de 128MB

df_rev_ye = dd.read_json(r'.\Raw Data\Yelp\review.json', lines=True, blocksize = '128MB')



In [4]:
# Despliegue de primeros 5 registros

df_rev_ye.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [5]:
# Drop de columnas innecesarias

f_rev_ye = df_rev_ye.drop(['useful','funny','cool'], axis = 1)

In [6]:
# Obtener informacion sobre el dataset de reviews

df_rev_ye.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 9 entries, review_id to date
dtypes: datetime64[ns](1), int64(4), string(4)

In [7]:
print(df_rev_ye.dtypes)


review_id      string[pyarrow]
user_id        string[pyarrow]
business_id    string[pyarrow]
stars                    int64
useful                   int64
funny                    int64
cool                     int64
text           string[pyarrow]
date            datetime64[ns]
dtype: object


In [8]:
print(df_rev_ye.npartitions)


41


In [9]:
num_rows = df_rev_ye.shape[0].compute()
print(num_rows)


6990280


In [10]:
null_counts = df_rev_ye.isnull().sum().compute()
print(null_counts)


review_id      0
user_id        0
business_id    0
stars          0
useful         0
funny          0
cool           0
text           0
date           0
dtype: int64


In [11]:
descriptive_stats = df_rev_ye.describe().compute()
print(descriptive_stats)


              stars        useful         funny          cool  \
count  6.990280e+06  6.990280e+06  6.990280e+06  6.990280e+06   
min    1.000000e+00 -1.000000e+00 -1.000000e+00 -1.000000e+00   
25%    3.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
50%    4.000000e+00  1.000000e+00  0.000000e+00  0.000000e+00   
75%    5.000000e+00  2.000000e+00  0.000000e+00  1.000000e+00   
max    5.000000e+00  1.182000e+03  7.920000e+02  4.040000e+02   
mean   3.748584e+00  1.184609e+00  3.265596e-01  4.986175e-01   
std    1.478705e+00  3.253767e+00  1.688729e+00  2.172460e+00   

                             date  
count                     6990280  
min           2005-02-16 03:23:22  
25%    2016-03-21 14:36:42.500000  
50%           2018-06-07 23:03:32  
75%           2020-06-30 02:02:45  
max           2022-01-19 19:48:45  
mean                         <NA>  
std                          <NA>  


### Dataset Bussines de Yelp

In [12]:
# Lectura de pkl y despliegue de primeros 5 registros

df_bs_ye = pd.read_pickle(r'.\Raw Data\Yelp\business.pkl')
df_bs_ye.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,business_id.1,name.1,address.1,city.1,state.1,postal_code.1,latitude.1,longitude.1,stars.1,review_count.1,is_open.1,attributes.1,categories.1,hours.1
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",,,,,,,,,,,,,,,
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ...",,,,,,,,,,,,,,
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...",,,,,,,,,,,,,,
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",,,,,,,,,,,,,,
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...",,,,,,,,,,,,,,


In [13]:
# Obtener informacion sobre Dataset Business

df_bs_ye.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150346 entries, 0 to 150345
Data columns (total 28 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   business_id   150346 non-null  object
 1   name          150346 non-null  object
 2   address       150346 non-null  object
 3   city          150346 non-null  object
 4   state         150343 non-null  object
 5   postal_code   150346 non-null  object
 6   latitude      150346 non-null  object
 7   longitude     150346 non-null  object
 8   stars         150346 non-null  object
 9   review_count  150346 non-null  object
 10  is_open       150346 non-null  object
 11  attributes    136602 non-null  object
 12  categories    150243 non-null  object
 13  hours         127123 non-null  object
 14  business_id   5 non-null       object
 15  name          5 non-null       object
 16  address       5 non-null       object
 17  city          5 non-null       object
 18  state         5 non-null     

In [14]:
# Eliminar columnas duplicadas manteniendo la primera instancia

df_bs_ye = df_bs_ye.loc[:,~df_bs_ye.columns.duplicated()]


In [15]:
# Convertir campo ciudades a minusculas
df_bs_ye['city'] = df_bs_ye['city'].str.lower()


In [16]:
# Despliegue de nombres de ciudades para determinar si se encuentran las 10 ciudades objetivo

list(df_bs_ye['city'].unique())


['santa barbara',
 'affton',
 'tucson',
 'philadelphia',
 'green lane',
 'ashland city',
 'brentwood',
 'st. petersburg',
 'nashville',
 "land o' lakes",
 'tampa bay',
 'indianapolis',
 'clearwater',
 'largo',
 'new orleans',
 'kenner',
 'edmonton',
 'reno',
 'newtown',
 'white house',
 'boise',
 'paoli',
 'ardmore',
 'exton',
 'wilmington',
 'edwardsville',
 'sparks',
 'alton',
 'cherry hill',
 'bala cynwyd',
 'springfield',
 'belleville',
 'carmel',
 'tampa',
 'kennett square',
 'plymouth meeting',
 'harvey',
 'west chester',
 'meridian',
 'hudson',
 'fernley',
 'williamstown',
 'pinellas park',
 'glenolden',
 'wesley chapel',
 'fishers',
 'burlington',
 'troy',
 'camden',
 'plainfield',
 'bensalem',
 'maplewood',
 'saint louis',
 'fairview heights',
 'oro valley',
 'treasure island',
 'southampton',
 'chalfont',
 'willow grove',
 'voorhees',
 'tarpon springs',
 'blue bell',
 'metairie',
 'woodbury',
 'brownsburg',
 'norristown',
 'land o lakes',
 'greenwood',
 'saint petersburg',
 '

In [17]:
city_list = [
    "atlanta",
    "boston",
    "dallas",
    "houston",
    "kansas",
    "los angeles",
    "miami",
    "new york",
    "new jersey",
    "philadelphia",
    "san francisco",
    "seattle"
]



In [18]:
# Mascara para filtrar dataset con ciudades objetivo

df_bs_ye_filtered = df_bs_ye[df_bs_ye['city'].isin(city_list)]

In [19]:
# Despliegue de resultados

list(df_bs_ye_filtered['city'].unique())

['philadelphia', 'new jersey', 'houston', 'los angeles', 'boston']

In [20]:


df_bs_ye_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14580 entries, 3 to 150336
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   business_id   14580 non-null  object
 1   name          14580 non-null  object
 2   address       14580 non-null  object
 3   city          14580 non-null  object
 4   state         14580 non-null  object
 5   postal_code   14580 non-null  object
 6   latitude      14580 non-null  object
 7   longitude     14580 non-null  object
 8   stars         14580 non-null  object
 9   review_count  14580 non-null  object
 10  is_open       14580 non-null  object
 11  attributes    13407 non-null  object
 12  categories    14571 non-null  object
 13  hours         11794 non-null  object
dtypes: object(14)
memory usage: 1.7+ MB


In [21]:
list(df_bs_ye_filtered['state'].unique())

['CA',
 'IN',
 'AZ',
 'PA',
 'NJ',
 'FL',
 'MO',
 'LA',
 'AB',
 'ID',
 'TN',
 'NV',
 'DE',
 'IL',
 'HI']

In [22]:
business_ids_of_interest = df_bs_ye_filtered['business_id'].unique()

In [23]:
df_rev_ye_filtered = df_rev_ye[df_rev_ye['business_id'].isin(business_ids_of_interest)]
df_rev_ye_filtered.head()


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31
13,8JFGBuHMoiNDyfcxuWNtrA,smOvOajNG0lS4Pq7d8g4JQ,RZtGWDLCAtuipwaZ-UfjmQ,4,0,0,0,Good food--loved the gnocchi with marinara the...,2009-10-14 19:57:14
16,oyaMhzBSwfGgemSGuZCdwQ,Dd1jQj7S-BFGqRbApFzCFw,YtSqYv1Q_pOltsVPSx54SA,5,0,0,0,Tremendous service (Big shout out to Douglas) ...,2013-06-24 11:21:25


In [24]:
num_rows = df_rev_ye_filtered.shape[0].compute()
print(num_rows)


967890


In [27]:
# Exportar el DataFrame de negocios a un archivo Parquet
df_bs_ye_filtered.to_parquet('business_filtered.parquet')

# Reparticionar el DataFrame a una sola partición
df_rev_ye_filtered = df_rev_ye_filtered.repartition(npartitions=1)

# Guardar el DataFrame reparticionado en un solo archivo Parquet
df_rev_ye_filtered.to_parquet('reviews_filtered.parquet', write_index=True)


In [13]:
# Que tama;o tiene el archivo parquet de Reviews?

tamaño_del_archivo = os.path.getsize(r'.\reviews_filtered.parquet\part.0.parquet')
tamaño_en_kb = tamaño_del_archivo / 1024
tamaño_en_mb = tamaño_en_kb / 1024
tamaño_en_mb

421.29759407043457

### Dataset User yelp

In [17]:
df_user= pd.read_parquet(r'Datasetsyelp/user.parquet')
df_user

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105592,4QGxxakRZeOlg_qDuxmTeQ,Jennilee,38,2012-01-19 23:33:02,74,9,6,,kmwNG5LZSHFmveg6wYYdrw,0,...,1,0,0,0,1,4,0,0,1,0
2105593,tmelBbVBGAzXBVfH2u_R6g,Gerry,19,2009-06-09 16:34:54,14,5,2,,"BFYdCAMFyjYHDwesndEXEg, _9fTIqfSJc7g3V_o76XRVg...",1,...,1,0,0,0,0,1,0,0,0,0
2105594,tpBznnD6uJN3m_pJubj09w,Emily,26,2013-08-13 23:18:11,4,1,2,,"bKV3ly2MuK-K1cptMrFknQ, liel18zRoSB4tEkUP7i6Cg...",0,...,0,0,0,0,1,0,0,0,0,0
2105595,Kst_srPw7GdYydMFYdCtzw,Heatheranne,25,2015-01-10 00:06:25,21,2,5,,"dzHTk52vbGtbktRm_B-wEg, fOfFLV7IbBDN6lzARaLqdg...",0,...,0,0,0,0,0,1,0,0,0,0


In [18]:
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105597 entries, 0 to 2105596
Data columns (total 22 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   name                object 
 2   review_count        int64  
 3   yelping_since       object 
 4   useful              int64  
 5   funny               int64  
 6   cool                int64  
 7   elite               object 
 8   friends             object 
 9   fans                int64  
 10  average_stars       float64
 11  compliment_hot      int64  
 12  compliment_more     int64  
 13  compliment_profile  int64  
 14  compliment_cute     int64  
 15  compliment_list     int64  
 16  compliment_note     int64  
 17  compliment_plain    int64  
 18  compliment_cool     int64  
 19  compliment_funny    int64  
 20  compliment_writer   int64  
 21  compliment_photos   int64  
dtypes: float64(1), int64(16), object(5)
memory usage: 353.4+ MB


In [19]:
df_user.describe()

Unnamed: 0,review_count,useful,funny,cool,fans,average_stars,compliment_hot,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
count,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0,2105597.0
mean,28.46875,55.16853,22.67667,31.43013,1.913746,3.641863,2.444596,0.3968618,0.2532631,0.1858665,0.09631663,1.921464,4.124809,3.805567,3.805567,1.459133,1.499514
std,104.2146,738.8068,462.8811,637.9274,24.3651,1.159519,79.63704,14.05658,17.67698,11.70051,10.49796,61.77753,136.4953,105.7484,105.7484,36.76692,96.07754
min,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,3.0,0.0,0.0,0.0,3.88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,19.0,15.0,3.0,4.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,17473.0,206296.0,185823.0,199878.0,12497.0,5.0,25784.0,13501.0,14180.0,13654.0,12669.0,59031.0,101097.0,49967.0,49967.0,15934.0,82630.0


In [20]:
#Columnas existentes en la data
df_user.columns

Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')

In [21]:
#Renombramos columnas
df_user.rename(columns={'yelping_since': 'since'}, inplace=True)
df_user.rename(columns={'average_stars': 'stars'}, inplace=True)
df_user

Unnamed: 0,user_id,name,review_count,since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2105592,4QGxxakRZeOlg_qDuxmTeQ,Jennilee,38,2012-01-19 23:33:02,74,9,6,,kmwNG5LZSHFmveg6wYYdrw,0,...,1,0,0,0,1,4,0,0,1,0
2105593,tmelBbVBGAzXBVfH2u_R6g,Gerry,19,2009-06-09 16:34:54,14,5,2,,"BFYdCAMFyjYHDwesndEXEg, _9fTIqfSJc7g3V_o76XRVg...",1,...,1,0,0,0,0,1,0,0,0,0
2105594,tpBznnD6uJN3m_pJubj09w,Emily,26,2013-08-13 23:18:11,4,1,2,,"bKV3ly2MuK-K1cptMrFknQ, liel18zRoSB4tEkUP7i6Cg...",0,...,0,0,0,0,1,0,0,0,0,0
2105595,Kst_srPw7GdYydMFYdCtzw,Heatheranne,25,2015-01-10 00:06:25,21,2,5,,"dzHTk52vbGtbktRm_B-wEg, fOfFLV7IbBDN6lzARaLqdg...",0,...,0,0,0,0,0,1,0,0,0,0


In [22]:
#Ordenamos las columnas y eliminamos las columnas que no son relevantes en el analisis
columnas= ['user_id', 'name', 'since', 'review_count', 'fans', 'stars', 'friends']
df_user= df_user[columnas]
df_user

Unnamed: 0,user_id,name,since,review_count,fans,stars,friends
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,2007-01-25 16:47:26,585,267,3.91,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA..."
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,2009-01-25 04:35:42,4333,3138,3.74,"ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A..."
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,2008-07-25 10:41:00,665,52,3.32,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA..."
3,SZDeASXq7o05mMNLshsdIA,Gwen,2005-11-29 04:38:33,224,28,4.27,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg..."
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,2007-01-05 19:40:59,79,1,3.54,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA..."
...,...,...,...,...,...,...,...
2105592,4QGxxakRZeOlg_qDuxmTeQ,Jennilee,2012-01-19 23:33:02,38,0,2.98,kmwNG5LZSHFmveg6wYYdrw
2105593,tmelBbVBGAzXBVfH2u_R6g,Gerry,2009-06-09 16:34:54,19,1,3.68,"BFYdCAMFyjYHDwesndEXEg, _9fTIqfSJc7g3V_o76XRVg..."
2105594,tpBznnD6uJN3m_pJubj09w,Emily,2013-08-13 23:18:11,26,0,3.85,"bKV3ly2MuK-K1cptMrFknQ, liel18zRoSB4tEkUP7i6Cg..."
2105595,Kst_srPw7GdYydMFYdCtzw,Heatheranne,2015-01-10 00:06:25,25,0,2.85,"dzHTk52vbGtbktRm_B-wEg, fOfFLV7IbBDN6lzARaLqdg..."


In [23]:
df_user

Unnamed: 0,user_id,name,since,review_count,fans,stars,friends
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,2007-01-25 16:47:26,585,267,3.91,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA..."
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,2009-01-25 04:35:42,4333,3138,3.74,"ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A..."
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,2008-07-25 10:41:00,665,52,3.32,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA..."
3,SZDeASXq7o05mMNLshsdIA,Gwen,2005-11-29 04:38:33,224,28,4.27,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg..."
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,2007-01-05 19:40:59,79,1,3.54,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA..."
...,...,...,...,...,...,...,...
2105592,4QGxxakRZeOlg_qDuxmTeQ,Jennilee,2012-01-19 23:33:02,38,0,2.98,kmwNG5LZSHFmveg6wYYdrw
2105593,tmelBbVBGAzXBVfH2u_R6g,Gerry,2009-06-09 16:34:54,19,1,3.68,"BFYdCAMFyjYHDwesndEXEg, _9fTIqfSJc7g3V_o76XRVg..."
2105594,tpBznnD6uJN3m_pJubj09w,Emily,2013-08-13 23:18:11,26,0,3.85,"bKV3ly2MuK-K1cptMrFknQ, liel18zRoSB4tEkUP7i6Cg..."
2105595,Kst_srPw7GdYydMFYdCtzw,Heatheranne,2015-01-10 00:06:25,25,0,2.85,"dzHTk52vbGtbktRm_B-wEg, fOfFLV7IbBDN6lzARaLqdg..."


In [24]:
# Guardar el DataFrame reparticionado en un solo archivo Parquet
df_user.to_parquet('user_data.parquet', compression='gzip', index=False)
