# Importing necessary libraries and dropping extra columns

In [1]:
import pandas as pd

In [44]:
df = pd.read_csv('Scraped_Yelp_Reviews.csv')

In [45]:
df2 = pd.read_csv('Yelp_Dataset_Reviews.csv')

In [46]:
df.drop(columns=['Unnamed: 0'], inplace=True)
df2.drop(columns=['Unnamed: 0'], inplace=True)

In [47]:
df.head()

Unnamed: 0,Cool,Funny,Rating,Text,Useful
0,19.0,5.0,5.0 star rating,Talk about WOW. If anyone ever asked me what m...,12.0
1,14.0,3.0,5.0 star rating,Mighty Quinn's is a fast-casual barbecue chain...,21.0
2,113.0,59.0,5.0 star rating,Phenomenal Restaurant !!! We came here Monday ...,107.0
3,18.0,7.0,5.0 star rating,Did I just eat THE burger in New York City? Th...,14.0
4,14.0,6.0,5.0 star rating,So places like this really do exist.Like many ...,15.0


In [48]:
df.groupby('Rating').count()

Unnamed: 0_level_0,Cool,Funny,Text,Useful
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0 star rating,3,3,3,3
2.0 star rating,5,5,7,6
3.0 star rating,182,182,187,186
4.0 star rating,1869,1842,1873,1869
5.0 star rating,2452,2413,2459,2456


In [49]:
df2.head()

Unnamed: 0,cool,funny,stars,text,useful
0,15,10,3.0,Review #189 - 2016\n\nWe arrived at Hertz at a...,16
1,22,17,3.0,We were searching for a local Peruvian restaur...,28
2,5,8,1.0,Britney is one of the few stars that has ever ...,5
3,5,8,2.0,My gf suggested we eat here because they have ...,13
4,12,13,1.0,"Service is terrible. \n\nThere were 2 of us, t...",22


In [51]:
df2.groupby('stars').count()

Unnamed: 0_level_0,cool,funny,text,useful
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,4790,4790,4790,4790
2.0,5497,5497,5497,5497
3.0,12785,12785,12785,12785


# Based on the heads of the 2 dataframes above, I need to do the following:

### 1. Change the column names to be consistent
### 2. Drop the "star rating" part in my df

In [52]:
df2.rename(columns={'cool':'Cool', 'funny':'Funny', 'stars':'Rating', 'text':'Text', 'useful':'Useful'}, inplace=True)

In [53]:
df2.head()

Unnamed: 0,Cool,Funny,Rating,Text,Useful
0,15,10,3.0,Review #189 - 2016\n\nWe arrived at Hertz at a...,16
1,22,17,3.0,We were searching for a local Peruvian restaur...,28
2,5,8,1.0,Britney is one of the few stars that has ever ...,5
3,5,8,2.0,My gf suggested we eat here because they have ...,13
4,12,13,1.0,"Service is terrible. \n\nThere were 2 of us, t...",22


In [54]:
df['Rating']= df['Rating'].map(lambda x: x.strip('star rating'))

# Some of my reviews will have blanks for Cool, Funny, or Useful so filling them with a 0

In [56]:
df.fillna(0, inplace = True)

In [57]:
df.groupby('Rating').count()

Unnamed: 0_level_0,Cool,Funny,Text,Useful
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,3,3,3,3
2.0,7,7,7,7
3.0,187,187,187,187
4.0,1873,1873,1873,1873
5.0,2459,2459,2459,2459


# Want to make the datatypes consistent between my 2 dataframes before merge

In [58]:
df['Cool']= df['Cool'].map(lambda x: int(x))
df['Funny']= df['Funny'].map(lambda x: int(x))
df['Useful']= df['Useful'].map(lambda x: int(x))
df['Rating']= df['Rating'].map(lambda x: float(x))

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4529 entries, 0 to 4528
Data columns (total 5 columns):
Cool      4529 non-null int64
Funny     4529 non-null int64
Rating    4529 non-null float64
Text      4529 non-null object
Useful    4529 non-null int64
dtypes: float64(1), int64(3), object(1)
memory usage: 177.0+ KB


In [60]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23072 entries, 0 to 23071
Data columns (total 5 columns):
Cool      23072 non-null int64
Funny     23072 non-null int64
Rating    23072 non-null float64
Text      23072 non-null object
Useful    23072 non-null int64
dtypes: float64(1), int64(3), object(1)
memory usage: 901.3+ KB


In [61]:
df3 = pd.concat([df, df2])

In [62]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27601 entries, 0 to 23071
Data columns (total 5 columns):
Cool      27601 non-null int64
Funny     27601 non-null int64
Rating    27601 non-null float64
Text      27601 non-null object
Useful    27601 non-null int64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.3+ MB


In [63]:
df3.reset_index(inplace=True)

In [64]:
df3.tail()

Unnamed: 0,index,Cool,Funny,Rating,Text,Useful
27596,23067,5,6,2.0,"In theory this place is a fun concept, food, b...",7
27597,23068,14,13,2.0,"""Six-foot, seven-foot, eight-foot BUNCH!\nSix-...",12
27598,23069,10,9,3.0,"When one is denied gnocchi, one makes do with ...",8
27599,23070,12,8,3.0,Not sure that I like the new comp system for d...,17
27600,23071,5,11,1.0,I think this owner and the owner of Amy's Baki...,13


In [65]:
df3.groupby('Rating').count()

Unnamed: 0_level_0,index,Cool,Funny,Text,Useful
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,4793,4793,4793,4793,4793
2.0,5504,5504,5504,5504,5504
3.0,12972,12972,12972,12972,12972
4.0,1873,1873,1873,1873,1873
5.0,2459,2459,2459,2459,2459


In [66]:
df3.to_csv('Consolidated_Dataframe.csv')