In [1]:
import random
import pandas as pd
from typing import TypeVar, List, Tuple
from sentenceQuality import sentenceQuality

# Group: Jonah Mulcrone, Brian Sung, Emmanuel Obikwelu

X = TypeVar('X') # This is a generic type to represent a data point

def split_data(data: List[X], prob: float) -> Tuple[List[X], List[X]]:
    """Randomly split ``data`` into two lists with fractions [prob, 1 - prob].

    The caller's sequence is never modified. The elements are first copied
    into a new list, because slicing (``data[:]``) only copies for lists; for
    view-producing sequences such as NumPy arrays or memoryviews it returns
    a view, and shuffling that view would reorder the caller's data.

    Args:
        data: The data points to split.
        prob: Fraction of the data, between 0 and 1 inclusive, that goes
            into the first returned list.

    Returns:
        A ``(first, second)`` tuple of lists. ``first`` holds
        ``int(len(data) * prob)`` randomly chosen elements and ``second``
        holds the rest.

    Raises:
        ValueError: If ``prob`` is not within [0, 1]. A negative value would
            otherwise silently produce a wrong split via negative slicing.
    """
    if not 0 <= prob <= 1:
        raise ValueError(f"prob must be between 0 and 1, got {prob!r}")

    shuffled = list(data)             # Independent copy, safe to shuffle in place
    random.shuffle(shuffled)
    cut = int(len(shuffled) * prob)   # Index where the first piece ends
    return shuffled[:cut], shuffled[cut:]

# Sanity check on 1000 integers (0 through 999) split 75% / 25%
data = list(range(1000))
train, test = split_data(data, 0.75)

# Both pieces must have the expected sizes...
assert (len(train), len(test)) == (750, 250)

# ...and together they must contain every original value exactly once
assert sorted([*train, *test]) == data

#####################################
#    Pair input/output variables    #
#####################################

Y = TypeVar('Y') # Represents output variables


def train_test_split(xs: List[X],
                     ys: List[Y],
                     test_pct: float) -> Tuple[List[X], List[X], List[Y], List[Y]]:
    """Split paired inputs and outputs into training and test sets.

    The positions ``0 .. len(xs) - 1`` are split at random, and the same
    positions are then used to pick from both ``xs`` and ``ys``, so every
    input stays paired with its output.

    Args:
        xs: Input values.
        ys: Output values; ``ys[i]`` belongs to ``xs[i]``.
        test_pct: Fraction of the pairs to place in the test set.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test)`` of lists.

    Raises:
        ValueError: If ``xs`` and ``ys`` differ in length. Without this
            check a longer ``ys`` would be silently truncated and a shorter
            one would fail with an IndexError part-way through.
    """
    if len(xs) != len(ys):
        raise ValueError(
            f"xs and ys must have the same length, got {len(xs)} and {len(ys)}"
        )

    # Split positions rather than values so both sequences share one shuffle
    idxs = list(range(len(xs)))
    train_idxs, test_idxs = split_data(idxs, 1 - test_pct)

    return ([xs[i] for i in train_idxs],
            [xs[i] for i in test_idxs],
            [ys[i] for i in train_idxs],
            [ys[i] for i in test_idxs])

##########################################
#    Making sure the code works right    #
##########################################

# Build 1000 inputs whose outputs are exactly twice the input
xs = list(range(1000))
ys = [x * 2 for x in xs]
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.25)


# A 25% test share of 1000 pairs means 750 training and 250 test pairs
assert len(x_train) == 750 and len(y_train) == 750
assert len(x_test) == 250 and len(y_test) == 250


# Every output must still line up with the input it was derived from
assert all(2 * x == y for x, y in zip(x_train, y_train))
assert all(2 * x == y for x, y in zip(x_test, y_test))

#################################
#    Machine learning basics    #
#################################

def accuracy(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of all predictions that were correct.

    The correct predictions are the true positives plus the true negatives;
    the denominator is the sum of all four counts.
    """
    return (tp + tn) / (tp + fp + fn + tn)

assert accuracy(70, 4930, 13930, 981070) == 0.98114

def precision(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of positive predictions that were correct.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
        fn: Number of false negatives (unused; kept so all metric functions
            share one signature).
        tn: Number of true negatives (unused, same reason).

    Returns:
        ``tp / (tp + fp)``, or 0.0 when there were no positive predictions
        at all, instead of raising ZeroDivisionError.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive

assert precision(70, 4930, 13930, 981070) == 0.014

def recall(tp: int, fp: int, fn: int, tn: int) -> float:
    """Return the fraction of actual positives that were identified.

    Args:
        tp: Number of true positives.
        fp: Number of false positives (unused; kept so all metric functions
            share one signature).
        fn: Number of false negatives.
        tn: Number of true negatives (unused, same reason).

    Returns:
        ``tp / (tp + fn)``, or 0.0 when there were no actual positives at
        all, instead of raising ZeroDivisionError.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

assert recall(70, 4930, 13930, 981070) == 0.005



# Main Method

# Load the reviews and keep only the 'review title' column of the first 250 rows
file = pd.read_csv('Best-Selling-Book-Reviews.csv')
df = file[['review title']].head(250)

# Split the data into training and testing.
# The titles are handed over as a plain Python list rather than the column's
# NumPy array, so the shuffle inside split_data can never write into (or be
# blocked by a read-only view of) the DataFrame's underlying buffer.
train, test = split_data(df['review title'].tolist(), 0.75)
print(train)

obj = sentenceQuality()

# Create Quality Scores for Reviews
# Each title is run through calculateScores and then calculateQuality; the
# result is rounded to one decimal place and stored as a string.
# NOTE(review): scores are kept as strings, presumably to serve as class
# labels - confirm against how 'Quality' is consumed downstream.
quality_scores = [
    str(round(obj.calculateQuality(obj.calculateScores(review)), 1))
    for review in train
]

# Create Dataframe with Reviews Mapping to Scores

df_train_scores = pd.DataFrame({
    'Review Title': train,
    'Quality': quality_scores
})

print(df_train_scores)

# Train Test Split: hold out 25% of the scored training reviews

x_train, x_test, y_train, y_test = train_test_split(
    df_train_scores['Review Title'].values,
    df_train_scores['Quality'].values,
    0.25,
)




Sentiment(polarity=1.0, subjectivity=0.3)
8.0
35.0
The scores for your input is [0.43, 1.0, 0.3, 0.2876249999999999]
Sentiment(polarity=1.0, subjectivity=0.3)
8.0
35.0
The final quality for your input is 0.50440625
["My granddaughter's favorite!" 'Ted Bundy, Coriolanus and The Joker'
 'It’s as if he sacrificed characters for beautiful prose.'
 '"We can live as cowards or die as riders."' 'So fun!'
 'Book in GREAT condition' 'In her words, heartbreaking and hopeful'
 'Living the Wordh' 'A Beautiful And Enlightening World War II Novel'
 'Powerful Book' 'Truth about the Word' 'Love it' 'Great Book on PTSD'
 'Pretty Good....' 'Boys' 'Carried Away to Another World' 'Great book'
 '90s flashback' 'Excellent' 'Cute little story'
 'Lords of Discipline + Dragonriders of Pern + ...steamy romance? But it works!'
 'So Good!!!' 'Such a cute book!' 'Great gift'
 'DANGER: read at your own risk' 'Funny 4th Grade Read Aloud'
 'Clear + Practical SelfHelp Book For Those Wanting to Improve Productivity'
 '