# Foundations of AI & ML
## Session 05
### CaseStudy
### Lab

**Objectives:** Create a linear regression based product rating solution.


In [1]:
import pandas as pd
data = pd.read_csv("../Datasets/amazon_reviews.csv")
print(data.describe())
data = data.dropna()
print(data.describe())

         Unnamed: 0        ratings
count  167597.00000  167597.000000
mean    83798.00000       4.356307
std     48381.23087       0.993501
min         0.00000       1.000000
25%     41899.00000       4.000000
50%     83798.00000       5.000000
75%    125697.00000       5.000000
max    167596.00000       5.000000
          Unnamed: 0        ratings
count  167504.000000  167504.000000
mean    83798.019253       4.356427
std     48380.619090       0.993334
min         0.000000       1.000000
25%     41899.750000       4.000000
50%     83795.500000       5.000000
75%    125699.250000       5.000000
max    167596.000000       5.000000


In [2]:
data.head()

Unnamed: 0.1,Unnamed: 0,reviews,ratings
0,0,I like the item pricing. My granddaughter want...,5.0
1,1,Love the magnet easel... great for moving to d...,4.0
2,2,Both sides are magnetic. A real plus when you...,5.0
3,3,Bought one a few years ago for my daughter and...,5.0
4,4,I have a stainless steel refrigerator therefor...,4.0


In [3]:
data.tail()

Unnamed: 0.1,Unnamed: 0,reviews,ratings
167592,167592,This drone is very fun and super duarable. Its...,5.0
167593,167593,This is my brother's most prized toy. It's ext...,5.0
167594,167594,This Panther Drone toy is awesome. I definitel...,5.0
167595,167595,This is my first drone and it has proven to be...,5.0
167596,167596,This is a super fun toy to have around. In our...,4.0


In [4]:
ratings = data['ratings'].values
reviews = data['reviews'].values
lengths = [len(r) for r in reviews]

In [5]:
ratings[:5], reviews[:5]

(array([5., 4., 5., 5., 4.]),
 array(['I like the item pricing. My granddaughter wanted to mark on it but I wanted it just for the letters.',
        'Love the magnet easel... great for moving to different areas... Wish it had some sort of non skid pad on bottom though...',
        "Both sides are magnetic.  A real plus when you're entertaining more than one child.  The four-year old can find the letters for the words, while the two-year old can find the pictures the words spell.  (I bought letters and magnetic pictures to go with this board).  Both grandkids liked it a lot, which means I like it a lot as well.  Have not even introduced markers, as this will be used strictly as a magnetic board.",
        'Bought one a few years ago for my daughter and she loves it, still using it today. For the holidays we bought one for our niece and she loved it too.',
        'I have a stainless steel refrigerator therefore there are not much space for my son to play with his magnet. Brought this f

In [6]:
### TEMP TEMP TEMP DEBUGGING
# ratings = ratings[:2000]
# reviews = reviews[:2000]

#### We first preprocess the data by removing all the incorrect rows (that have missing rating or reviews), unwanted columns, removing stopwords and soon.

In [7]:
import re
only_alnum = re.compile(r"[^a-z0-9]+")
## Replaces one or more occurrence of any characters other than a-z and 0-9 with a space
## This automatically replaces multiple spaces by 1 space

## The try ... except ensures that if a review is mal-formed then the review is replaced with the word ERROR
def cleanUp(s):
    return re.sub(only_alnum, " ", s.lower())

In [8]:
## We make a set for testing if a word is not useful
## sets are way faster than lists for this purpose
fluff = set([w.strip() for w in open("../Datasets/fluff.txt")])

In [9]:
## Replace words like coooooool with cool, amaaaaaazing with amaazing and so on
def dedup(s):
    return re.sub(r'([a-z])\1+', r'\1\1', s)
print(dedup("cooooool"))
print(dedup("amaaaaaazzzzing"))
print(dedup('cool'))

cool
amaazzing
cool


In [10]:
def get_useful_words(s):
    return [dedup(w) for w in cleanUp(s).split() if len(w) > 2 and w not in fluff]

In [11]:
clean_reviews = [get_useful_words(review) for review in reviews]
for i in range(5):
    print("%4d" %(len(reviews[i])), reviews[i], "\n==>", clean_reviews[i])

 100 I like the item pricing. My granddaughter wanted to mark on it but I wanted it just for the letters. 
==> ['like', 'item', 'pricing', 'granddaughter', 'mark', 'letters']
 121 Love the magnet easel... great for moving to different areas... Wish it had some sort of non skid pad on bottom though... 
==> ['love', 'magnet', 'easel', 'great', 'moving', 'wish', 'sort', 'skid', 'pad', 'bottom']
 420 Both sides are magnetic.  A real plus when you're entertaining more than one child.  The four-year old can find the letters for the words, while the two-year old can find the pictures the words spell.  (I bought letters and magnetic pictures to go with this board).  Both grandkids liked it a lot, which means I like it a lot as well.  Have not even introduced markers, as this will be used strictly as a magnetic board. 
==> ['magnetic', 'real', 'plus', 'entertaining', 'more', 'child', 'letters', 'words', 'pictures', 'words', 'spell', 'bought', 'letters', 'magnetic', 'pictures', 'board', 'grandki

In [12]:
final_reviews = list(zip(clean_reviews, ratings, lengths))
#We look at a Random sample of 10 cleaned data.
import random
for i in range(10):
    r = random.randrange(0, len(final_reviews))
    print(final_reviews[r])

(['love', 'games', 'like', 'stone', 'age', 'lords', 'waterdeep', 'tried', 'hawaii', 'game', 'bit', 'more', 'complicated', 'learn', 'first', 'bit', 'more', 'fiddly', 'lot', 'double', 'sided', 'bits', 'sort', 'worth', 'hang', 'shouldn', 'more', 'play', 'game', 'kinds', 'awesome', 'stone', 'age', 'lords', 'waterdeep', 'aren', 'first', 'meeple', 'chief', 'move', 'modular', 'board', 'instead', 'bunch', 'workers', 'like', 'regular', 'worker', 'placement', 'games', 'forms', 'wooden', 'currency', 'shells', 'money', 'feet', 'movement', 'fruit', 'wild', 'card', 'build', 'villages', 'adding', 'buildings', 'items', 'people', 'plus', 'buy', 'boats', 'visit', 'neighboring', 'islands', 'contain', 'random', 'goodies', 'bunch', 'help', 'king', 'hawaii', 'first', 'form', 'shells', 'feet', 'round', 'generosity', 'diminishes', 'means', 'supplement', 'declining', 'income', 'purchasing', 'various', 'shell', 'feet', 'producing', 'huts', 'buying', 'fruits', 'layout', 'tiles', 'stuff', 'buy', 'change', 'game',

In [13]:
clean_reviews[:2], ratings[:2]

([['like', 'item', 'pricing', 'granddaughter', 'mark', 'letters'],
  ['love',
   'magnet',
   'easel',
   'great',
   'moving',
   'wish',
   'sort',
   'skid',
   'pad',
   'bottom']],
 array([5., 4.]))

** Case-Study:** Use the list of substantive words extracted from the Review as well as the length of the original Review. Decide how you would like to Derive a feature set to predict the Rating, which is a float (1.0 to 5.0).

Remember to split the Data into training, testing and Validation sets.
1. Select 10% of the Data for testing and put it away.
2. Select 20% of the Data for Validation and 70% for Training.
3. Vary the above ratio between Validation and Testing: 30 - 60, 45 - 45, 60 - 30 and Verify the effect if any on the prediction accuracy.


Some Possibilities:

1. You can use a single feature namely, the difference between number of Positive & Negative words. 

2. You can also considering predicting the rating based on the above difference and add the length of the Review as two independent Variables.

3. You could consider the Positive Words and Negative Words as two independent Variables rather than treating their difference as single independent Variable, giving you more possibilities.


In [14]:
positive_words = pd.read_csv('../Datasets/positive-words.txt').values
negative_words=  pd.read_csv('../Datasets/negative-words.txt').values

def getwordcount(words):    
    positive_count = 0
    negative_count = 0
    neutral_count = 0
    
    total = []
    #print words
    
    for w in words:        
        if w in positive_words:
            positive_count += 1
        elif w in negative_words:
            negative_count += 1
        else:
            neutral_count += 1
            
    total.append(positive_count)
    total.append(negative_count)
#     total.append(neutral_count)
#     total.append(labels[count])
    return total

In [15]:
reviews_feature_counts = []
for i in range(len(clean_reviews)):
    if i % 100 == 0: print(i)

    # Derive feature vector with - +ve words, -ve words, len(reviews)
    reviews_feature_counts.append(getwordcount(clean_reviews[i]) + [lengths[i]]) 

reviews_feature_counts[:2]

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400
18

133000
133100
133200
133300
133400
133500
133600
133700
133800
133900
134000
134100
134200
134300
134400
134500
134600
134700
134800
134900
135000
135100
135200
135300
135400
135500
135600
135700
135800
135900
136000
136100
136200
136300
136400
136500
136600
136700
136800
136900
137000
137100
137200
137300
137400
137500
137600
137700
137800
137900
138000
138100
138200
138300
138400
138500
138600
138700
138800
138900
139000
139100
139200
139300
139400
139500
139600
139700
139800
139900
140000
140100
140200
140300
140400
140500
140600
140700
140800
140900
141000
141100
141200
141300
141400
141500
141600
141700
141800
141900
142000
142100
142200
142300
142400
142500
142600
142700
142800
142900
143000
143100
143200
143300
143400
143500
143600
143700
143800
143900
144000
144100
144200
144300
144400
144500
144600
144700
144800
144900
145000
145100
145200
145300
145400
145500
145600
145700
145800
145900
146000
146100
146200
146300
146400
146500
146600
146700
146800
146900
147000
147100
147200

[[1, 0, 100], [2, 0, 121]]

In [16]:
def train_test_split(data, TRAIN_TEST_RATIO = 0.8):
    picker = list(range(len(data)))
    random.shuffle(picker)       

    FEATURE_COLUMNS = list(range(2)) # 0, 1
    ALL_COLUMNS = FEATURE_COLUMNS + [2]

    ## Raw Data ###
#     data = data.reindex(columns = ALL_COLUMNS)
    trainMax = int(len(picker) * TRAIN_TEST_RATIO)
    train = []
    test = []
    for pick in picker[:trainMax]:
        train.append(list(data.values[pick]))         ### select 80% of data to be used as training set
    for pick in picker[trainMax:]:
        test.append(list(data.values[pick])) 
        
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)
#     print(train.head())
    return train.iloc[:, 0:-1], train.iloc[:, -1:], test.iloc[:, 0:-1], test.iloc[:, -1:]


In [17]:
import numpy as np
type(reviews_feature_counts), type(ratings)

data = []
# np.hstack((features, ratings))
for i, r in enumerate(reviews_feature_counts):
     data.append(r + [ratings[i]])
data = pd.DataFrame(data)
train_X, train_y, test_X, test_y = train_test_split(data, TRAIN_TEST_RATIO=.8)

In [18]:
# train_X, train_y, test_X, test_y
# np.where(train_y[3] == 5.0, 1, 0)
# train_y[3]
train_X.values

array([[ 11.,   4., 954.],
       [  7.,   0., 625.],
       [  8.,   0., 555.],
       ...,
       [  5.,   1., 443.],
       [  3.,   0., 117.],
       [  5.,   1., 113.]])

In [19]:
### Now that we have divided our data into training and testing set, we can apply KNN to easily deduce
### the predicted ratings (multi-class classification). But it is not very scalable for this huge dataset. 
### So, I will be applying Linear Classification to predict the ratings.

from classifiers import *

def get_linear_accuracies(train_X, train_y, test_X, test_y):
    accuracies = np.zeros((5, 5))
    # list(test_y.columns.values)
    # test_y.loc[test_y[3]==4.0]
    for i, r in enumerate([1.0, 2.0, 3.0, 4.0, 5.0]):
        train_y_binary = np.where(train_y[3] == r, 1, 0)
        for j, t_r in enumerate([1.0, 2.0, 3.0, 4.0, 5.0]):    
            test_y_binary = np.where(test_y[3] == t_r, 1, 0)
#             print(r, t_r, test_y_binary[:10])
            accuracies[i][j] = predict_and_find_accuracy('linear', train_X.values, train_y_binary, test_X.values, test_y_binary)

    return accuracies


In [20]:
train_X, train_y, test_X, test_y = train_test_split(data, TRAIN_TEST_RATIO=.7)
get_linear_accuracies(train_X, train_y, test_X, test_y)

array([[0.97214041, 0.96217066, 0.90344663, 0.77941176, 0.38241264],
       [0.97214041, 0.96125527, 0.90237204, 0.77817798, 0.38133806],
       [0.97204091, 0.96230996, 0.90358593, 0.77955106, 0.38251214],
       [0.8449017 , 0.84189684, 0.80613707, 0.72759293, 0.37644273],
       [0.10325957, 0.11032397, 0.15388442, 0.26450689, 0.65744647]])

In [21]:
train_X, train_y, test_X, test_y = train_test_split(data, TRAIN_TEST_RATIO=.5)
get_linear_accuracies(train_X, train_y, test_X, test_y)

array([[0.97229917, 0.9611472 , 0.90131579, 0.77502627, 0.38677285],
       [0.96630528, 0.95288471, 0.89355478, 0.76585634, 0.37898796],
       [0.0876994 , 0.09777677, 0.15526793, 0.26844732, 0.57144904],
       [0.97225141, 0.96203076, 0.90198443, 0.77600535, 0.38772805],
       [0.02774859, 0.03796924, 0.09801557, 0.22399465, 0.61227195]])

In [22]:
train_X, train_y, test_X, test_y = train_test_split(data, TRAIN_TEST_RATIO=.9)
get_linear_accuracies(train_X, train_y, test_X, test_y)

array([[0.97301654, 0.96370366, 0.90245358, 0.77619247, 0.38463375],
       [0.96871829, 0.95618172, 0.89600621, 0.76819294, 0.37723121],
       [0.97301654, 0.96370366, 0.90245358, 0.77619247, 0.38463375],
       [0.74759716, 0.74795535, 0.72431497, 0.68437705, 0.38188765],
       [0.03766939, 0.04722106, 0.10763537, 0.23509044, 0.62450003]])

In [23]:
# predict_and_find_accuracy('kNN', train_X.values, train_y.values, test_X.values, test_y.values, k=10)

0.5303020017024405

In [24]:
"""
    Apply logistic regression for each rating to create 5 classifiers.
    Predict based on the best probability
"""

def logf(a, b, X):
    return 1.0 / (1.0 + np.exp(-a * X - b))

def dlogf(a, b, X):
    return logf(a, b, X) * (1 - logf(a, b, X))
##
## The derivative of the logistic function is f * (1 - f)
##
def one_step(X, y, a, b, eta):
    ycalc = logf(a, b, X)
    delta_a = sum((y - ycalc) * ycalc * (1 - ycalc) * X)
    delta_b = sum((y - ycalc) * ycalc * (1 - ycalc))
    a = a + delta_a * eta
    b = b + delta_b * eta
    error = sum((y - ycalc)**2)
    return a, b, error

def train_logistic(train_X, train_y, a=1, b=1, eta=.001, iterations=100000):
    for times in range(iterations):
        a, b, error = one_step(train_X, train_y, a, b, eta)
        if times % 1000 == 0:
            eta = max(0.00001, eta * 0.99)
            if times % 5000 == 0:
                print(a, b, error)
                
    return a, b, error

In [25]:
def logistic_multi_classifier(train_X, train_y, iterations=100000):
    # Store a, b and error for each ratings in this
    accuracies = []

    for i, r in enumerate([1.0, 2.0, 3.0, 4.0, 5.0]):
        train_y_binary = np.where(train_y[3] == r, 1, 0)
        accuracies.append(train_logistic(train_X, train_y_binary, iterations=iterations))
    
    return accuracies

def get_logistic_predictions(reg, my_test_X):
    m = np.zeros(len(test_X))
    result = np.zeros(len(test_X))
    for i, r in enumerate(reg):
#         print('rating = %d, args = %s' % (i+1, r))
        result = np.vstack((result, logf(r[0], r[1], my_test_X).values))

    return np.argmax(result, axis=0)

In [26]:
# You can use a single feature namely, the difference between number of Positive & Negative words.
# Train with my_X
my_X = train_X[0]-train_X[1]
my_test_X = test_X[0] - test_X[1]

reg = logistic_multi_classifier(my_X, train_y, iterations=100000) ### TEMP TEMP TEMP DEBUGGING - set low iteration
predictions = get_logistic_predictions(reg, my_test_X)
predictions


-4.233857642544481 -4.864547058536086 124334.77417816465
-0.5810938434690696 -6.082953298031106 4188.104919498645
-0.5326519863067984 -3.096177055587078 3999.2814336671954
-0.5185367811398574 -3.057851397052744 3984.8456006951255
-0.5021622554950662 -3.019568072888305 3970.0487587564417
-0.48297313862488045 -2.981246169089002 3954.933123872618
-0.4602449678028369 -2.942915618755123 3939.533714465045
-0.43285169734729256 -2.9047963485957706 3923.870870338413
-0.39822806821335477 -2.8673372577218745 3907.9107210548436
-0.34391741413489696 -2.8317793589444844 3891.4280041825355
-0.3079618518800988 -2.826089938744539 3887.735106060809
-0.30796185187996156 -2.826089938744547 3887.735106060789
-0.3079618518799614 -2.8260899387445466 3887.735106060789
-0.3079618518799616 -2.8260899387445475 3887.735106060789
-0.3079618518799615 -2.826089938744547 3887.735106060789
-0.30796185187996145 -2.8260899387445466 3887.735106060789
-0.3079618518799616 -2.8260899387445475 3887.735106060789
-0.3079618518

array([5, 5, 2, ..., 5, 5, 5])