In [1]:
# Import dependencies.
import pandas as pd

In [2]:
# Read the Vine review table from its CSV export.
file = 'Data/Vine_Table.csv'
df = pd.read_csv(filepath_or_buffer=file)

# Report the table dimensions, then preview the first ten rows.
print(df.shape)
df.iloc[:10]

(3105520, 6)


Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,RQ58W7SMO911M,4.0,2.0,3.0,N,N
1,RF6IUKMGL8SF,3.0,5.0,5.0,N,N
2,R1DOSHH6AI622S,4.0,1.0,22.0,N,N
3,RATOTLA3OF70O,5.0,2.0,2.0,N,N
4,R1TNWRKIVHVYOV,4.0,0.0,2.0,N,N
5,R2F53LI9KK9MOY,4.0,2.0,2.0,N,N
6,R1KJ6MB7MRSQFF,4.0,9.0,11.0,N,N
7,R2XIM9LT335WHE,4.0,3.0,10.0,N,N
8,R1VE0FQQ0QTQJN,5.0,16.0,20.0,N,Y
9,R1VKEE2NWSWDRU,4.0,0.0,0.0,N,N


In [3]:
# Tally the reviews by the value of their 'vine' flag.
df.loc[:, 'vine'].value_counts()

N    3105513
Y          2
Name: vine, dtype: int64

In [4]:
# Keep only the reviews that received at least 20 total votes.
helpful_df = df.loc[df['total_votes'].ge(20)]

# Report the filtered dimensions, then preview the first five rows.
print(helpful_df.shape)
helpful_df.iloc[:5]

(501609, 6)


Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
2,R1DOSHH6AI622S,4.0,1.0,22.0,N,N
8,R1VE0FQQ0QTQJN,5.0,16.0,20.0,N,Y
16,R1AABFZH0J0C0C,3.0,17.0,35.0,N,N
17,R3SJYLG07EHNE9,2.0,16.0,34.0,N,N
22,R3MD5TZJGIPTDU,5.0,21.0,31.0,N,N


In [5]:
# Tally the surviving reviews by the value of their 'vine' flag.
helpful_df.loc[:, 'vine'].value_counts()

N    501609
Name: vine, dtype: int64

#### All of the remaining reviews are from non-vine members. We do not want to exclude vine members, so instead we will keep rows with total_votes greater than or equal to 1, which is enough to avoid dividing by zero.

In [6]:
# Keep every review with at least one vote, so that dividing by
# total_votes never divides by zero.
helpful_df = df.loc[df['total_votes'].ge(1)]

# Report the filtered dimensions, then preview the first five rows.
print(helpful_df.shape)
helpful_df.iloc[:5]

(2789333, 6)


Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,RQ58W7SMO911M,4.0,2.0,3.0,N,N
1,RF6IUKMGL8SF,3.0,5.0,5.0,N,N
2,R1DOSHH6AI622S,4.0,1.0,22.0,N,N
3,RATOTLA3OF70O,5.0,2.0,2.0,N,N
4,R1TNWRKIVHVYOV,4.0,0.0,2.0,N,N


In [7]:
# Tally the surviving reviews by the value of their 'vine' flag.
helpful_df.loc[:, 'vine'].value_counts()

N    2789331
Y          2
Name: vine, dtype: int64

In [8]:
# Keep the reviews where at least half of the votes are labeled 'helpful'.
mostly_helpful_df = helpful_df.loc[
    helpful_df['helpful_votes'].div(helpful_df['total_votes']).ge(0.50)
]

# Report the filtered dimensions, then preview the first five rows.
print(mostly_helpful_df.shape)
mostly_helpful_df.iloc[:5]

(2325564, 6)


Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,RQ58W7SMO911M,4.0,2.0,3.0,N,N
1,RF6IUKMGL8SF,3.0,5.0,5.0,N,N
3,RATOTLA3OF70O,5.0,2.0,2.0,N,N
5,R2F53LI9KK9MOY,4.0,2.0,2.0,N,N
6,R1KJ6MB7MRSQFF,4.0,9.0,11.0,N,N


In [9]:
# Tally the mostly-helpful reviews by the value of their 'vine' flag.
mostly_helpful_df.loc[:, 'vine'].value_counts()

N    2325562
Y          2
Name: vine, dtype: int64

In [10]:
# Select the mostly-helpful reviews whose 'vine' flag is 'N' (non-vine).
no_vine_df = mostly_helpful_df.loc[mostly_helpful_df['vine'].eq('N')]

# Report the subset dimensions, then preview the first ten rows.
print(no_vine_df.shape)
no_vine_df.iloc[:10]

(2325562, 6)


Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,RQ58W7SMO911M,4.0,2.0,3.0,N,N
1,RF6IUKMGL8SF,3.0,5.0,5.0,N,N
3,RATOTLA3OF70O,5.0,2.0,2.0,N,N
5,R2F53LI9KK9MOY,4.0,2.0,2.0,N,N
6,R1KJ6MB7MRSQFF,4.0,9.0,11.0,N,N
8,R1VE0FQQ0QTQJN,5.0,16.0,20.0,N,Y
11,R21SYDQ70ILUC0,5.0,1.0,1.0,N,Y
12,R7M06Z88PD7SX,4.0,14.0,14.0,N,N
13,RRS38KZ4WB5O2,5.0,1.0,1.0,N,N
14,R2YDYRSLGNHPHR,4.0,16.0,19.0,N,Y


In [11]:
# Select the mostly-helpful reviews whose 'vine' flag is 'Y' (vine).
vine_df = mostly_helpful_df.loc[mostly_helpful_df['vine'].eq('Y')]

# Report the subset dimensions, then preview up to ten rows.
print(vine_df.shape)
vine_df.iloc[:10]

(2, 6)


Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
59102,R2NKNL4WXWIWBG,4.0,3.0,3.0,Y,Y
1198651,RAIG9ZQDMBF5P,5.0,3.0,6.0,Y,N


In [26]:
# Tally the 5-star ratings among the non-vine reviews.
no_vine_stars = no_vine_df.loc[:, 'star_rating']
five_stars_no_vine = no_vine_stars.loc[no_vine_stars.eq(5.0)]

# count() reports how many non-missing 5.0 ratings were selected.
five_star_no_vine_count = five_stars_no_vine.count()
print(f'There are {five_star_no_vine_count} 5-star non-vine member reviews.')

There are 1467616 5-star non-vine member reviews.


In [28]:
# Count the number of 5 star ratings for vine reviews.
vine_stars = vine_df['star_rating']
five_stars_vine = vine_stars[vine_stars == 5.0]

five_star_vine_count = five_stars_vine.count()

# These are vine reviews, so the message must say "vine" rather than
# "non-vine". Pick singular or plural wording from the count so the
# sentence stays grammatical for any result.
verb, noun = ('is', 'review') if five_star_vine_count == 1 else ('are', 'reviews')
print(f'There {verb} {five_star_vine_count} 5-star vine member {noun}.')

There is 1 5-star non-vine member reviews.
