# Data Preprocessing

- **Section 1:** The user is first asked whether they want the Chicago dataset or the Las-Vegas dataset. Once the user has chosen, we read in the corresponding plain text file and parse from it all the feature subsets that the classifier will need.
- **Section 2:** Create a data frame with all the features that we collected from the dataset about each review. Store this data frame in a .csv file.
- **Section 3:** Extract the additional feature subset from the dataset and create an additional data frame which has the additional feature subset. Store this data frame in a .csv file.

### Section 1

#### Key - Table

- R1: The mean review helpfulness over all reviews authored by the user.
- R2: The standard deviation of review helpfulness over all reviews authored by the user.
- R3: The percentage of reviews authored by the user which have received a minimum of T opinions; in this work, T = 5

In [None]:
import textstat
import re
import pickle
import pandas as pd
from sklearn.utils import resample
import statistics

In [None]:
# Ask which dataset to process: 1 selects Chicago, any other integer selects Las-Vegas.
user_decision = int(input('Enter 1 to read Chicago dataset and enter 2 to read Las-Vegas dataset'))
text_file = 'chicago.txt' if user_decision == 1 else 'las-vegas.txt'

In [None]:
# Read the whole raw dataset into one string. The context manager closes the
# file handle even if decoding fails part-way through.
with open(text_file, 'r', encoding='mac_roman') as f:
    list_of_lines = f.readlines()
text_ = ''.join(list_of_lines)

In [None]:
# Despite its name, review_count is the list of review bodies: the text found
# between each <review> line and the following </review> line.
regex = r'<review>\n([\s\S]*?)\n</review>'
review_count = re.compile(regex).findall(text_)

Each review is converted to an element of a list. We then use the Python pickle library to write this list of reviews directly to a file, so that it can later be loaded back as a list without parsing the dataset again. The output file is chosen according to the user's choice of dataset.

In [None]:
# Pickle the parsed review list to the file that belongs to the chosen dataset.
reviews_path = "reviews_file_chicago.txt" if user_decision == 1 else "reviews_file_las-vegas.txt"
with open(reviews_path, "wb") as fp:   # pickle writes bytes, hence binary mode
    pickle.dump(review_count, fp)

In [None]:
def listToString(s):
    """Join the strings in ``s`` into one string, separated by single spaces."""
    separator = " "
    joined = separator.join(s)
    return joined

Parse the hotel ids from the dataset, creating a list of hotel ids

In [None]:
# One entry per review: the <hotelUrl> value(s) found in it, space-joined.
regex = r'<hotelUrl>\n([\s\S]*?)\n</hotelUrl>'
hotel_id = [listToString(re.findall(regex, review)) for review in review_count]

Parse the reviewer ids from the dataset, creating a list of reviewer ids

In [None]:
# One entry per review: the <memberUrl> value(s) found in it, space-joined.
regex = r'<memberUrl>\n([\s\S]*?)\n</memberUrl>'

reviewer_id = [listToString(re.findall(regex, review)) for review in review_count]

Parse the helpfulness score from the dataset, creating a list in the form "X of Y" found this to be helpful

In [None]:
# One entry per review: the list of <helpfulness> values that appear after the
# closing </memberUrl> tag.
regex = r'\n</memberUrl>[\s\S]*?\n<helpfulness>\n(.*?)\n</helpfulness>'
helpfulness_scores = [re.findall(regex, review) for review in review_count]

Calculate the fraction of helpfulness opinions that were positive; this fraction is the helpfulness score for the review

In [None]:
from statistics import mean

# Convert every "X of Y" helpfulness string into the fraction X / Y.
# The two counts are parsed with int() instead of eval(): the text comes
# straight from the dataset file and eval() would execute whatever it contains.
all_values = []

for item in helpfulness_scores:
    item = listToString(item)
    helpful_votes, _, total_votes = item.partition(' of ')
    total_votes = int(total_votes)
    # A review with no opinions scores 0 rather than dividing by zero.
    all_values.append(int(helpful_votes) / total_votes if total_votes else 0)

# Replace the raw regex matches, in place, with the numeric scores.
helpfulness_scores[:] = all_values

average_helpfulness_score = mean(all_values)

In [None]:
def avoid_zero_division(d):
    """Return True when ``d`` compares equal to zero, otherwise False."""
    is_zero = d == 0
    return is_zero

Creating a dictionary with the R1 values to add to the data frame later on

In [None]:
regex = r'<memberUrl>\n(.*?)\n</memberUrl>[\s\S]*?\n<helpfulness>\n(.*?)\n</helpfulness>'

R1 = dict()

all_member_urls = []
R3_data_points = []

# Pair each review's author with its raw "X of Y" helpfulness text. Two lists
# of separate [name, score] pairs are built so that changing one list's
# entries does not affect the other.
for review in review_count:
    name, score = re.findall(regex, review)[0]
    all_member_urls.append([name, score])
    R3_data_points.append([name, score])

In [None]:
# Convert the raw "X of Y" helpfulness text of every [author, text] pair into
# the fraction X / Y. int() is used instead of eval() because the text is read
# from the dataset file and eval() would execute arbitrary expressions.
values = []
for item in all_member_urls:
    helpful_votes, _, total_votes = item[1].partition(' of ')
    total_votes = int(total_votes)
    # No opinions at all -> score 0 instead of a division by zero.
    values.append(int(helpful_votes) / total_votes if total_votes else 0)

In [None]:
# Swap each pair's raw helpfulness text for its numeric score, position by position.
for position, numeric_score in enumerate(values[:len(all_member_urls)]):
    all_member_urls[position][1] = numeric_score

In [None]:
# Group the numeric helpfulness scores by author: {author: [score, ...]}.
R1 = dict()

for name, score in all_member_urls:
    R1.setdefault(name, []).append(score)

In [None]:
# R1 per author: the mean of that author's helpfulness scores.
R1_results = {author: mean(scores) for author, scores in R1.items()}

Creating the R2 values dictionary to add to the data frame later on

In [None]:
from statistics import stdev

# R2 per author: sample standard deviation of that author's helpfulness scores.
# stdev() needs at least two values. An author with a single review has no
# spread, so 0 is recorded -- storing the review's own score here would put
# the review's helpfulness value into a "deviation" feature.
R2_results = {
    author: stdev(scores) if len(scores) > 1 else 0
    for author, scores in R1.items()
}

In [None]:
regex = r'<memberUrl>\n(.*?)\n</memberUrl>'

# One R2 value per review, looked up through the review's author.
R2 = [R2_results[re.search(regex, review)[1]] for review in review_count]

In [None]:
# Group the entries of R3_data_points by author: {author: [score, ...]}.
new_R1 = dict()

for name, score in R3_data_points:
    new_R1.setdefault(name, []).append(score)

Creating the R3 values dictionary to add to the data frame later on

In [None]:
# R3 per author: the fraction of the author's reviews that received at least
# MIN_OPINIONS helpfulness opinions.
MIN_OPINIONS = 5

R3_results = dict()

percent = 0
for key in new_R1:
    list_of_vals = new_R1[key]
    # The opinion count is the Y in "X of Y". Parse the whole number after
    # " of ": slicing the last two characters misreads counts with three or
    # more digits (e.g. "50 of 100" would be read as 0).
    count = sum(
        1 for raw in list_of_vals
        if int(raw.rpartition(' of ')[2]) >= MIN_OPINIONS
    )
    total = len(list_of_vals)
    percent = count / total
    R3_results[key] = percent

In [None]:
regex = r'<memberUrl>\n(.*?)\n</memberUrl>'

# One R3 value per review, looked up through the review's author.
R3 = [R3_results[re.search(regex, review)[1]] for review in review_count]

In [None]:
regex = r'<memberUrl>\n(.*?)\n</memberUrl>'
count = 0

# R1 is rebound here from the per-author dict to a per-review list of the
# author's mean helpfulness.
R1 = [R1_results[re.search(regex, review)[1]] for review in review_count]

# Social Features


- SL1: The number of reviews authored by the user.
- SL2: The mean number of reviews authored by all users.
- SL3: The standard deviation of the number of reviews authored by all users.
- SL4: The number of reviews submitted for the hotel.
- SL5: The mean number of reviews submitted for all hotels.
- SL6: The standard deviation of the number of reviews submitted for all hotels.

The code below extracts the Social features subset from the dataset and stores the features in individual lists,
which are added to the data frame later on

In [None]:
# Number of reviews written by each author.
SL1_ = {author: len(entries) for author, entries in new_R1.items()}

In [None]:
regex = r'<memberUrl>\n(.*?)\n</memberUrl>'

# One SL1 value per review: how many reviews its author wrote.
SL1 = [SL1_[re.search(regex, review)[1]] for review in review_count]

In [None]:
# SL2 / SL3: mean and sample standard deviation of the number of reviews
# authored per user. The counts come from SL1_ (one entry per user); SL1 is a
# per-review list and cannot be indexed by its own values.
total = list(SL1_.values())

SL2 = statistics.mean(total)
# stdev() raises with fewer than two data points; report no spread instead.
SL3 = statistics.stdev(total) if len(total) > 1 else 0

In [None]:
regex = r'<hotelUrl>\n(.*?)\n</hotelUrl>'

# Number of reviews found for each hotel.
SL4_ = dict()

for review in review_count:
    hotel_name = re.search(regex, review)[1]
    SL4_[hotel_name] = SL4_.get(hotel_name, 0) + 1

In [None]:
regex = r'<hotelUrl>\n(.*?)\n</hotelUrl>'

# One SL4 value per review: how many reviews its hotel has.
SL4 = [SL4_[re.search(regex, review)[1]] for review in review_count]

In [None]:
# SL5 / SL6: mean and sample standard deviation of the number of reviews
# submitted per hotel. The counts come from SL4_ (one entry per hotel); SL4 is
# a per-review list and cannot be indexed by its own values.
total = list(SL4_.values())

SL5 = statistics.mean(total)
# stdev() raises with fewer than two data points; report no spread instead.
SL6 = statistics.stdev(total) if len(total) > 1 else 0

# Sentiment Features

- ST1: The score assigned by the user to the hotel.
- ST2: The number of (optional) sub-scores assigned by the user.
- ST3: The mean sub-score assigned by the user.
- ST4: The standard deviation of the sub-scores assigned by the user.
- ST5: The mean score over all reviews authored by the user.
- ST6: The standard deviation of the scores over all reviews authored by the user.
- ST7: The mean score assigned by all users to the hotel.
- ST8: The standard deviation of scores assigned by all users to the hotel.

Below this code extracts the Sentiment features subset from the dataset and stores these into individual lists to then add to the data frame later on

In [None]:
j = 0

regex = r'<rating>\n([\s\S]*?)\n</rating>'

# The first <rating> value of each review, as an integer.
ratings = [int(re.findall(regex, review)[0]) for review in review_count]

In [None]:
value_regex = r'<value>\n(.*?)\n</value>'
rooms_regex = r'<rooms>\n(.*?)\n</rooms>'
location_regex = r'</rooms>\n<location>\n(.*?)\n</location>'
cleanliness_regex = r'<cleanliness>\n(.*?)\n</cleanliness>'
frondesk_regex = r'<checkInFrontDesk>\n(.*?)\n</checkInFrontDesk>'
service_regex = r'<service>\n(.*?)\n</service>'
bizness_regex = r'<businessService>\n(.*?)\n</businessService>'

sub_score_patterns = (
    value_regex,
    rooms_regex,
    location_regex,
    cleanliness_regex,
    frondesk_regex,
    bizness_regex,
    service_regex,
)

# ST2 per review: how many of the seven sub-score tags are present with a
# value other than the literal text 'null'.
ST2 = []

for review in review_count:
    count = 0
    for pattern in sub_score_patterns:
        found = re.search(pattern, review)
        if found and found[1] != 'null':
            count += 1
    ST2.append(count)

In [None]:
# ST3 and ST4 are single values: the mean and sample standard deviation of the
# per-review sub-score counts held in ST2.
# NOTE(review): the feature list describes ST3/ST4 as statistics of the
# sub-score values themselves, not of how many were given -- confirm intent.
ST3 = mean(ST2)
ST4 = stdev(ST2)

In [None]:
regex = r'<memberUrl>\n(.*?)\n</memberUrl>\n[\s\S]*?\n<rating>\n(.*?)\n</rating>'

# member_score maps each author to the list of ratings they gave.
member_score = dict()
# member_stdev must be its own dict: binding it to member_score would make the
# two names share one object, so storing deviations in member_stdev would
# overwrite the rating lists.
member_stdev = dict()

for review in review_count:
    name, rating = re.findall(regex, review)[0]
    rating = int(rating)
    member_score.setdefault(name, []).append(rating)

In [None]:
# Mean rating given by each author.
member_mean = {author: mean(scores) for author, scores in member_score.items()}

In [None]:
# Sample standard deviation of the ratings given by each author, built as a
# new dict so member_score keeps its rating lists untouched.
# stdev() needs at least two values; an author with a single rating has no
# spread, so 0 is recorded rather than the rating itself.
member_stdev = {
    author: stdev(scores) if len(scores) > 1 else 0
    for author, scores in member_score.items()
}

In [None]:
regex = r'<memberUrl>\n(.*?)\n</memberUrl>'
count = 0

# One ST5 value per review: the mean rating given by its author.
ST5 = [member_mean[re.search(regex, review)[1]] for review in review_count]

In [None]:
ST6 = []

regex = r'<memberUrl>\n(.*?)\n</memberUrl>'
count = 0

# One ST6 value per review, taken from member_stdev for the review's author.
# An entry that is a list contributes its first element.
for review in review_count:
    val = member_stdev[re.search(regex, review)[1]]
    ST6.append(val[0] if isinstance(val, list) else val)

In [None]:
ST7 = []

regex = r'<hotelRatingHistogram>\n(.*?)\n</hotelRatingHistogram>'
count_regex = r'<numberHotelRatings>\n(.*?)\n</numberHotelRatings>'
standard_dev = []

# The histogram is a comma-separated list of counts; its positions are
# weighted as 5, 4, 3, 2 and 1 stars, in that order.
star_values = (5, 4, 3, 2, 1)

for review in review_count:
    nums = re.search(regex, review)[1]
    count = int(re.search(count_regex, review)[1])
    list_nums = nums.split(',')
    star_counts = [int(list_nums[position]) for position in range(5)]

    total = sum(stars * n for stars, n in zip(star_values, star_counts))
    # Stored under its own name: rebinding `mean` here would shadow the
    # statistics.mean function the earlier cells rely on.
    hotel_mean = total / count if count else 0
    ST7.append(hotel_mean)

    # Expand the histogram into one entry per rating to compute the deviation.
    all_nums = [
        stars
        for stars, n in zip(star_values, star_counts)
        for _ in range(n)
    ]
    # stdev() needs at least two ratings; with fewer there is no spread.
    standard_dev.append(stdev(all_nums) if len(all_nums) > 1 else 0)

In [None]:
# ST8 mirrors standard_dev; an entry that is a list contributes its first element.
ST8 = [entry[0] if isinstance(entry, list) else entry for entry in standard_dev]

# Content Features

- C1: The number of terms in the review text.
- C2: The ratio of uppercase and lowercase characters to other characters in the review text.
- C3: The ratio of uppercase to lowercase characters in the review text.
- C4: Review completeness (a) – an integer in the range [0,2] which captures whether the user has completed one, both or none of the optional liked and disliked parts of the review (see Section 2.1).
- C5: Review completeness (b) – the number of optional personal and purpose of visit details that are provided by the user in the review (see Section 2.1).
- C6: Review completeness (c) – the number of optional review-template questions that are answered in the review (see Section 2.1).


Below this code extracts the Content features subset from the dataset and stores these into individual lists to then add to the data frame later on

In [None]:
text_list = []
C1 = []

text_regex = r'<reviewText>\n(.*?)</p>\n</reviewText>'

# C1 per review: number of whitespace-separated terms in the review text,
# or 0 when no review text is found.
for review in review_count:
    text = re.search(text_regex, review)
    if text is None:
        C1.append(0)
        continue
    new_text = re.sub('<br/>', '', text[1])
    new_text = re.sub('&amp;', ' ', new_text)
    new_text = re.sub('&quot;', ' ', new_text)
    # Drop the opening paragraph tag, as the Section 3 cells do, so markup is
    # not counted as terms.
    new_text = re.sub(r'<p id="review_[\d]*?">', '', new_text)

    # split() without an argument collapses runs of whitespace, so the empty
    # strings that split(' ') yields are not counted as terms.
    text_list = new_text.split()
    C1.append(len(text_list))

In [None]:
import string

# C2 per review: number of punctuation characters divided by the number of
# letters (lowercase + uppercase) in the raw review text, or 0 when the text
# is missing or contains no letters.
# NOTE(review): the feature list words C2 as letters relative to other
# characters; this computes punctuation relative to letters -- confirm intent.
C2 = []
punctuation_chars = set(string.punctuation)

for review in review_count:
    text = re.search(text_regex, review)
    if text is not None:
        text = text[1]
        lower_count = sum(1 for c in text if c.islower())
        upper_count = sum(1 for c in text if c.isupper())
        punc_count = sum(1 for c in text if c in punctuation_chars)
        letter_count = lower_count + upper_count
        # Guard the division: text without any letters would raise
        # ZeroDivisionError.
        C2.append(punc_count / letter_count if letter_count else 0)
    else:
        C2.append(0)

In [None]:
# C3 per review: uppercase characters divided by lowercase characters in the
# raw review text, or 0 when the text is missing or has no lowercase characters.
C3 = []

for review in review_count:
    text = re.search(text_regex, review)
    if text is not None:
        text = text[1]
        lower_count = sum(1 for c in text if c.islower())
        upper_count = sum(1 for c in text if c.isupper())
        # Guard the division: text without lowercase letters would raise
        # ZeroDivisionError.
        C3.append(upper_count / lower_count if lower_count else 0)
    else:
        C3.append(0)

In [None]:
C4 = []

liked_regex = r'<liked>\n(.*?)\n</liked>'
disliked_regex = r'<disliked>\n(.*?)\n</disliked>'

# C4 per review: how many of the <liked> / <disliked> parts hold a value other
# than the literal text 'null' (0, 1 or 2). A review that lacks one of the
# tags is treated as not having completed that part instead of raising.
for review in review_count:
    count = 0
    for pattern in (liked_regex, disliked_regex):
        found = re.search(pattern, review)
        if found is not None and found[1] != 'null':
            count += 1
    C4.append(count)

In [None]:
regex_stay = r'<dateOfStay>\n(.*?)\n</dateOfStay>'
regex_reason= r'<visitWasFor>\n(.*?)\n</visitWasFor>'
regex_age= r'<ageRange>\n(.*)\n</ageRange>'
regex_since= r'<memberSince>\n(.*?)\n</memberSince>'
regex_group = r'<travelingGroup>\n(.*?)\n</travelingGroup>'

detail_patterns = (regex_stay, regex_reason, regex_age, regex_since, regex_group)

# C5 per review: how many of the five detail tags hold a value other than the
# literal text 'null'. Every tag is expected to be present in every review.
C5 = []

for review in review_count:
    details = [re.search(pattern, review)[1] for pattern in detail_patterns]
    count = sum(1 for detail in details if detail != 'null')
    C5.append(count)


In [None]:
regex_qs = r'<numQuestions>\n(.*?)\n</numQuestions>'

# C6 per review: the <numQuestions> value as an integer.
C6 = [int(re.search(regex_qs, review)[1]) for review in review_count]

### Section 2

- Creating dataframe

Here we create the data frame using all the feature subsets we created above.

In [None]:
# Frame holding only the six content features, one row per review.
content_columns = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6']
content_rows = list(zip(C1, C2, C3, C4, C5, C6))
df = pd.DataFrame(content_rows, columns=content_columns)

In [None]:
# Columns that hold one value per review, in the order they appear in the frame.
per_review_columns = {
    'R1': R1, 'R2': R2, 'R3': R3,
    'C1': C1, 'C2': C2, 'C3': C3, 'C4': C4, 'C5': C5, 'C6': C6,
    'SL1': SL1, 'SL4': SL4,
    'ST1': ratings, 'ST2': ST2, 'ST7': ST7, 'ST8': ST8,
    'hotel_id': hotel_id, 'reviewer_id': reviewer_id,
}
df = pd.DataFrame(list(zip(*per_review_columns.values())),
     columns=list(per_review_columns))

# Remaining features, appended in this order. The scalar ones are repeated on
# every row by pandas; ST5 and ST6 are per-review lists.
appended_columns = {
    'SL2': SL2, 'SL3': SL3,
    'ST3': ST3, 'ST4': ST4,
    'ST5': ST5, 'ST6': ST6,
    'SL5': SL5, 'SL6': SL6,
}
for column_name, column_values in appended_columns.items():
    df[column_name] = column_values

In [None]:
# Store the identifier columns, then use the (hotel, reviewer) pair as the row index.
index_columns = {'hotel_id': hotel_id, 'reviewer_id': reviewer_id}
for column_name, column_values in index_columns.items():
    df[column_name] = column_values
df = df.set_index(list(index_columns))

Here we set the helpfulness score of the data frame

In [None]:
# Turn each helpfulness score into a class label, in place:
# below 0.75 -> 0, otherwise -> 1.
helpfulness_scores[:] = [0 if score < 0.75 else 1 for score in helpfulness_scores]
count = len(helpfulness_scores)

df['helpful_of_not'] = helpfulness_scores

We then take the balanced sample of the data frame. Then we write this sample to a .csv file

In [None]:
df_modified = df.copy()

# Split the rows by class label and identify the smaller (minority) class.
class_frames = [df_modified[df_modified.helpful_of_not == label] for label in (0, 1)]
df_minority, df_majority = sorted(class_frames, key=len)

# Downsample the majority class to the size of the minority class. The target
# size is derived from the data: a fixed row count only fits one dataset, and
# resample(replace=False) raises when asked for more rows than a class has.
# Sampling is unseeded, so the selected rows differ from run to run.
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # sample without replacement
                                   n_samples=len(df_minority))

# Combine minority class with downsampled majority class
df_modified = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_modified.helpful_of_not.value_counts()

In [None]:
# Write the balanced sample (df_modified), not the full frame, to the CSV for
# the chosen dataset. df itself is left untouched for the cells that follow.
if user_decision == 1:
    df_modified.to_csv('chicago_data_frame.csv')
else:
    df_modified.to_csv('las-vegas_data_frame.csv')

### Section 3

- Here we get the new feature subset. Add this to the data frame and then write this data frame to a .csv file.

In [None]:
def syllable_count(word):
    """Estimate the number of syllables in ``word``.

    Every run of consecutive vowels (a, e, i, o, u, y) counts as one syllable,
    a trailing "e" is discounted, and any non-empty word has at least one
    syllable. An empty string yields 0.
    """
    # Guard the word[0] access below, which would raise IndexError on "".
    if not word:
        return 0
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        # A vowel only starts a new syllable when it follows a non-vowel.
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count


In [None]:
number_of_complex_words = []

text_regex = r'<reviewText>\n(.*?)</p>\n</reviewText>'

# Per review: the number of words that syllable_count() rates above two
# syllables, or 0 when no review text is found.
for review in review_count:
    text = re.search(text_regex, review)
    if text is None:
        number_of_complex_words.append(0)
        continue
    new_text = re.sub('<br/>', '', text[1])
    new_text = re.sub('&amp;', ' ', new_text)
    new_text = re.sub('&quot;', ' ', new_text)
    # Raw string: "\d" inside a normal string literal is an invalid escape.
    new_text = re.sub(r'<p id="review_[\d]*?">', '', new_text)

    text_list = re.findall(r"[\w']+", new_text)
    # The inner variable has its own name so it does not reuse the review
    # loop's variable.
    complex_word = sum(1 for word in text_list if syllable_count(word) > 2)
    number_of_complex_words.append(complex_word)


In [None]:
# Add the per-review complex-word counts as a new column.
df['num_complex_words'] = number_of_complex_words

In [None]:
avg_syllables_per_word = []

text_regex = r'<reviewText>\n(.*?)</p>\n</reviewText>'

# Per review: the mean syllable_count() over the words of the review text,
# or 0 when the text is missing or contains no words.
for review in review_count:
    text = re.search(text_regex, review)
    if text is None:
        avg_syllables_per_word.append(0)
        continue
    new_text = re.sub('<br/>', '', text[1])
    new_text = re.sub('&amp;', ' ', new_text)
    new_text = re.sub('&quot;', ' ', new_text)
    # Raw string: "\d" inside a normal string literal is an invalid escape.
    new_text = re.sub(r'<p id="review_[\d]*?">', '', new_text)

    text_list = re.findall(r"[\w']+", new_text)
    count = [syllable_count(word) for word in text_list]
    # Always append a number: appending the list itself for reviews with
    # zero or one words would put list objects into the column.
    avg_syllables_per_word.append(statistics.mean(count) if count else 0)

In [None]:
# Add the values from avg_syllables_per_word as a new column (the column is
# named 'avg_syllables_per_review').
df['avg_syllables_per_review'] = avg_syllables_per_word

In [None]:
avg_words_per_sen = []

text_regex = r'<reviewText>\n(.*?)</p>\n</reviewText>'

# Per review: the mean number of words per sentence, where sentences are
# separated by '.', '?' or '!'; 0 when the text is missing or has no words.
for review in review_count:
    text = re.search(text_regex, review)
    if text is None:
        avg_words_per_sen.append(0)
        continue
    new_text = re.sub('<br/>', '', text[1])
    new_text = re.sub('&amp;', ' ', new_text)
    new_text = re.sub('&quot;', ' ', new_text)
    # Raw string: "\d" inside a normal string literal is an invalid escape.
    new_text = re.sub(r'<p id="review_[\d]*?">', '', new_text)

    text_list = re.split(r'[.?!]\s*', new_text)
    # Skip blank pieces: re.split leaves an empty string after the final
    # punctuation mark, which must not count as a one-word sentence.
    count = [len(sentence.split()) for sentence in text_list if sentence.strip()]
    # A review with a single sentence still has a meaningful average.
    avg_words_per_sen.append(statistics.mean(count) if count else 0)

In [None]:
# Add the per-review average sentence length as a new column.
df['avg_words_per_sen'] = avg_words_per_sen

In [None]:
gunning_fog_score = []

text_regex = r'<reviewText>\n(.*?)</p>\n</reviewText>'

# Per review: textstat's Gunning fog score of the cleaned review text.
for review in review_count:
    text = re.search(text_regex, review)
    if text is None:
        # Keep one entry per review so the list lines up with the data frame
        # rows; 0 is the placeholder the other feature lists use.
        gunning_fog_score.append(0)
        continue
    new_text = re.sub('<br/>', '', text[1])
    new_text = re.sub('&amp;', ' ', new_text)
    new_text = re.sub('&quot;', ' ', new_text)
    # Raw string: "\d" inside a normal string literal is an invalid escape.
    new_text = re.sub(r'<p id="review_[\d]*?">', '', new_text)
    gunning_fog_score.append(textstat.gunning_fog(new_text))

In [None]:
# Add the Gunning fog scores as a new column.
df['gunning_fog_score'] = gunning_fog_score

In [None]:
flesch_reading_ease_score = []

text_regex = r'<reviewText>\n(.*?)</p>\n</reviewText>'

# Per review: textstat's Flesch reading-ease score of the cleaned review text.
for review in review_count:
    text = re.search(text_regex, review)
    if text is None:
        # Keep one entry per review so the list lines up with the data frame
        # rows; 0 is the placeholder the other feature lists use.
        flesch_reading_ease_score.append(0)
        continue
    new_text = re.sub('<br/>', '', text[1])
    new_text = re.sub('&amp;', ' ', new_text)
    new_text = re.sub('&quot;', ' ', new_text)
    # Raw string: "\d" inside a normal string literal is an invalid escape.
    new_text = re.sub(r'<p id="review_[\d]*?">', '', new_text)
    flesch_reading_ease_score.append(textstat.flesch_reading_ease(new_text))


In [None]:
# Add the Flesch reading-ease scores as a new column.
df['flesch_reading_ease_score'] = flesch_reading_ease_score

In [None]:
flesch_kincaid_grade_score = []

text_regex = r'<reviewText>\n(.*?)</p>\n</reviewText>'

# Per review: textstat's Flesch-Kincaid grade of the cleaned review text.
for review in review_count:
    text = re.search(text_regex, review)
    if text is None:
        # Keep one entry per review so the list lines up with the data frame
        # rows; 0 is the placeholder the other feature lists use.
        flesch_kincaid_grade_score.append(0)
        continue
    new_text = re.sub('<br/>', '', text[1])
    new_text = re.sub('&amp;', ' ', new_text)
    new_text = re.sub('&quot;', ' ', new_text)
    # Raw string: "\d" inside a normal string literal is an invalid escape.
    new_text = re.sub(r'<p id="review_[\d]*?">', '', new_text)
    flesch_kincaid_grade_score.append(textstat.flesch_kincaid_grade(new_text))


In [None]:
# Add the Flesch-Kincaid grade scores as a new column.
df['flesch_kincaid_grade_score'] = flesch_kincaid_grade_score

In [None]:
smog_index_score = []

text_regex = r'<reviewText>\n(.*?)</p>\n</reviewText>'

# Per review: textstat's SMOG index of the cleaned review text.
for review in review_count:
    text = re.search(text_regex, review)
    if text is None:
        # Keep one entry per review so the list lines up with the data frame
        # rows; 0 is the placeholder the other feature lists use.
        smog_index_score.append(0)
        continue
    new_text = re.sub('<br/>', '', text[1])
    new_text = re.sub('&amp;', ' ', new_text)
    new_text = re.sub('&quot;', ' ', new_text)
    # Raw string: "\d" inside a normal string literal is an invalid escape.
    new_text = re.sub(r'<p id="review_[\d]*?">', '', new_text)
    smog_index_score.append(textstat.smog_index(new_text))

In [None]:
# Add the SMOG index scores as a new column.
df['smog_index_score'] = smog_index_score

In [None]:
# Move the index levels back into ordinary columns.
df = df.reset_index()

In [None]:
# Index the rows by the (hotel_id, reviewer_id) pair.
df = df.set_index(['hotel_id', 'reviewer_id'])

In [None]:
# Split the rows by class label and identify the smaller (minority) class.
class_frames = [df[df.helpful_of_not == label] for label in (0, 1)]
df_minority, df_majority = sorted(class_frames, key=len)

# Downsample the majority class to the size of the minority class. The target
# size is derived from the data: a fixed row count only fits one dataset, and
# resample(replace=False) raises when asked for more rows than a class has.
# Sampling is unseeded, so the selected rows differ from run to run.
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # sample without replacement
                                   n_samples=len(df_minority))

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled.helpful_of_not.value_counts()

In [None]:
# Write the balanced frame with the additional features to the CSV of the chosen dataset.
output_csv = (
    'chicago_data_frame_with_more_rows.csv'
    if user_decision == 1
    else 'las-vegas_data_frame_with_more_rows.csv'
)
df_downsampled.to_csv(output_csv)