In [153]:
import pandas as pd
import numpy as np
import re

In [2]:
from sklearn.metrics import mean_squared_error
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Methods

In [70]:
# Stems words to their root words and removes all characters that are not alphabets
def stem_str_item_description(str):
    ret_str = ""
    for w in word_tokenize(str.lower()):
        if w not in stop_words and w.isalpha() and len(w) > 1:
            ret_str = ret_str + " " + ps.stem(w)
    ret_str = re.sub("[^a-zA-Z]", " ", ret_str)
    return ret_str.strip()

In [13]:
# Stems words to their root words and removes all characters that are not alphabets
def clean_str_brand_name(str):
    ret_str = ""
    for w in str.lower():
        if w.isalnum() and len(w) > 0:
            ret_str = ret_str + w
#     ret_str = re.sub("[^a-zA-Z]", "", ret_str)
    return ret_str.strip()

In [57]:
def impute_category_name(row):
    if pd.isnull(row['category_name'])== True:
        if row['clean_brand_name'] in brand_cat_dict.keys():
            return brand_cat_dict[row['clean_brand_name']]
        else:
            return "No category name"
    else:
        return row['category_name']

In [145]:
# code copied from https://www.kaggle.com/marknagelberg/rmsle-function
def rmsle(y_pred, y_test) : 
    assert len(y_test) == len(y_pred)
    return np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2))

In [252]:
# Gets the count of most frequent words give a dataframe
def word_freq(df, col):
    word_frequency = {}
    word_frequency_lst = []
    for index,row in df.iterrows(): 
        for w in list(set(row[col].split(' '))):
            if w not in word_frequency:
                word_frequency[w] = 1
            else:
                word_frequency[w] += 1

    for key, value in word_frequency.items():
        temp = [key, value]
        word_frequency_lst.append(temp)
    word_freq_df = pd.DataFrame(word_frequency_lst, columns=["unique_word", 'frequency'])
    word_freq_df = word_freq_df.sort_values(['frequency'], ascending=False)
    return word_freq_df

# Read Data

In [2]:
unclean_train_data = pd.read_table('/Users/joashc/Downloads/mercari-price-suggestion-challenge/train.tsv')
unclean_train_data.shape

  """Entry point for launching an IPython kernel.


(1482535, 8)

# Clean Data

### Removing Unwanted Columns and Duplicate Rows

In [3]:
unclean_train_data_v2 = unclean_train_data.drop(columns='train_id')

In [4]:
unclean_train_data_v3 = unclean_train_data_v2.drop_duplicates()
unclean_train_data_v3.shape

(1482486, 7)

### Dealing with Nulls

In [5]:
unclean_train_data_v3.isnull().sum()

name                      0
item_condition_id         0
category_name          6327
brand_name           632641
price                     0
shipping                  0
item_description          4
dtype: int64

#### Brand Name Nulls

In [6]:
brand_value_counts = (pd.DataFrame(unclean_train_data_v3.brand_name.value_counts())
    .reset_index()
    .rename(columns={'index':'brand',
                    'brand_name':'count_rows'}))

print('Number of brands:', brand_value_counts.shape[0]-1)
brand_value_counts.head()

Number of brands: 4808


Unnamed: 0,brand,count_rows
0,PINK,54088
1,Nike,54043
2,Victoria's Secret,48035
3,LuLaRoe,31024
4,Apple,17322


In [7]:
unclean_train_data_v3['brand_name'] = unclean_train_data_v3['brand_name'].fillna("nobrandname")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [8]:
unclean_train_data_v3.isnull().sum()

name                    0
item_condition_id       0
category_name        6327
brand_name              0
price                   0
shipping                0
item_description        4
dtype: int64

#### Item Description Nulls

In [9]:
print('Approx', unclean_train_data_v3.item_description.isnull().sum()/unclean_train_data_v3.shape[0], 
      '% of the dataset contains item_description that are null.')
print()
print('I am not deleting these rows because if the test data contains nulls in the item_description,',
     'I want to impute those values.')

Approx 2.698170505488753e-06 % of the dataset contains item_description that are null.

I am not deleting these rows because if the test data contains nulls in the item_description, I want to impute those values.


In [10]:
print('Unique item descriptions:', unclean_train_data_v3.item_description.nunique())

Unique item descriptions: 1281426


In [11]:
pd.DataFrame(unclean_train_data_v3.item_description.value_counts()).head()

Unnamed: 0,item_description
No description yet,82489
New,4099
Brand new,3058
Good condition,1274
Great condition,1158


Fill null item descriptions with "No description yet" as there are 82489 items that currently do not have any descriptions

In [12]:
unclean_train_data_v3['item_description'] = unclean_train_data_v3['item_description'].fillna('No description yet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [13]:
unclean_train_data_v3.isnull().sum()

name                    0
item_condition_id       0
category_name        6327
brand_name              0
price                   0
shipping                0
item_description        0
dtype: int64

#### category_name nulls

In [14]:
print('Approx', round(unclean_train_data_v3.category_name.isnull().sum()/unclean_train_data_v3.shape[0], 3), 
      '% of the dataset contains category_name that are null.')
print()
print('I am not deleting these rows because if the test data contains nulls in the category_name,',
     'I want to impute those values.')

Approx 0.004 % of the dataset contains category_name that are null.

I am not deleting these rows because if the test data contains nulls in the category_name, I want to impute those values.


In [63]:
print("There are", 
     unclean_train_data_v3[(unclean_train_data_v3['item_description']!='No description yet') & 
                     (unclean_train_data_v3['brand_name']!='nobrandname') & 
                          (unclean_train_data_v3['category_name'].isnull()==True)].shape[0],
     "null values for category_name that have an item_description and brand_name")
print()
print("There are", 
     unclean_train_data_v3[(unclean_train_data_v3['item_description']=='No description yet') & 
                     (unclean_train_data_v3['brand_name']!='nobrandname') & 
                          (unclean_train_data_v3['category_name'].isnull()==True)].shape[0],
     "null values for category_name that have an brand_name but NO item_description")
print()
print("There are", 
     unclean_train_data_v3[(unclean_train_data_v3['item_description']!='No description yet') & 
                     (unclean_train_data_v3['brand_name']=='nobrandname') & 
                          (unclean_train_data_v3['category_name'].isnull()== True)].shape[0],
     "null values for category_name that have a item_description but NO brand_name")
print()
print("There are", 
     unclean_train_data_v3[(unclean_train_data_v3['item_description']=='No description yet') & 
                     (unclean_train_data_v3['brand_name']=='nobrandname') & 
                          (unclean_train_data_v3['category_name'].isnull()== True)].shape[0],
     "null values for category_name that have NO brand_name and NO item_description")

There are 2676 null values for category_name that have an item_description and brand_name

There are 194 null values for category_name that have an brand_name but NO item_description

There are 3029 null values for category_name that have a item_description but NO brand_name

There are 0 null values for category_name that have NO brand_name and NO item_description


In [16]:
pd.DataFrame(unclean_train_data_v3.category_name.value_counts()).head()

Unnamed: 0,category_name
"Women/Athletic Apparel/Pants, Tights, Leggings",60176
Women/Tops & Blouses/T-Shirts,46380
Beauty/Makeup/Face,34331
Beauty/Makeup/Lips,29908
Electronics/Video Games & Consoles/Games,26557


In [19]:
unclean_train_data_v3['category_name'] = np.where((unclean_train_data_v3['item_description']=='No description yet') &
                                                  (unclean_train_data_v3['brand_name']=='nobrandname') &
                                                   (unclean_train_data_v3['category_name'].isnull()== True),
                                                  "No category",
                                                  unclean_train_data_v3['category_name']
)

print(unclean_train_data_v3['category_name'].isnull().sum(), 'null values for category_name left!')

5899 null values for category_name left!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


For imputing the rest of the nulls for category_name, I want to use the item_description and brand_name columns. I first have to clean these columns to deal with the rest of the nulls.

### Initial Text Cleaning
- brand_name
    - Make everything lower case
    - Take out spaces
    - Take out special characters
- item_description
    - Make everything lower case
    - Take out special characters and numeric characters
    - Tokenize
    - Removal of stopwords

In [38]:
%%time
unclean_train_data_v3['stemmed_item_description'] = unclean_train_data_v3['item_description'].apply(
    lambda x: stem_str_item_description(str(x)))

CPU times: user 13min 29s, sys: 4.17 s, total: 13min 34s
Wall time: 13min 57s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [62]:
# replace nulls after cleaning with default talenized value
unclean_train_data_v3['stemmed_item_description'] = np.where(
    unclean_train_data_v3['stemmed_item_description'].isnull()==True,
'descript yet', unclean_train_data_v3['stemmed_item_description'])

To get from decriptions
 - oz
 - number of items
 - size (S, M, L, XL)

In [14]:
%%time
unclean_train_data_v3['clean_brand_name'] = unclean_train_data_v3['brand_name'].apply(
    lambda x: clean_str_brand_name(str(x)))

CPU times: user 2.82 s, sys: 47.8 ms, total: 2.87 s
Wall time: 2.88 s


In [54]:
unclean_train_data_v3.isnull().sum()

name                           0
item_condition_id              0
category_name               5899
brand_name                     0
price                          0
shipping                       0
item_description               0
stemmed_item_description    3320
clean_brand_name               0
dtype: int64

### Dealing with category_name Nulls

In [55]:
unique_brand_names_w_na_category = (list(unclean_train_data_v3[
    (unclean_train_data_v3['category_name'].isnull()== True)].clean_brand_name.unique()))

print(len(unique_brand_names_w_na_category), 'unique brand names with null category names')

606 unique brand names with null category names


In [56]:
%%time
brand_cat_dict = {}
for brand in unique_brand_names_w_na_category:
    subset_brand_data = unclean_train_data_v3[(unclean_train_data_v3['clean_brand_name']==brand) & 
                                             (unclean_train_data_v3['category_name'].isnull()==False)]
    if subset_brand_data.shape[0] > 1:
        # can only perform value counts of shape of dataset is greater than 1
        brand_cat_dict[brand] = pd.DataFrame(subset_brand_data.category_name.value_counts()).head(1).index[0]
print('Found most popular categories for', len(brand_cat_dict), 'brand names.',
     'For the rest of the brand names where we could not find the most popular category, we will,'
     ' impute no categpry for now.')

for brand in list(set(unique_brand_names_w_na_category) - set(brand_cat_dict.keys())):
    brand_cat_dict[brand] = "No category"

Found most popular categories for 579 brand names. For the rest of the brand names where we could not find the most popular category, we will, impute no categpry for now.
CPU times: user 51 s, sys: 144 ms, total: 51.1 s
Wall time: 51.3 s


In [33]:
test_df = unclean_train_data_v3[unclean_train_data_v3['category_name'].isnull()==True]

In [58]:
unclean_train_data_v3['category_name'] = unclean_train_data_v3.apply(impute_category_name, axis=1)

In [63]:
unclean_train_data_v3.isnull().sum()

name                        0
item_condition_id           0
category_name               0
brand_name                  0
price                       0
shipping                    0
item_description            0
stemmed_item_description    0
clean_brand_name            0
dtype: int64

### Clean category_name Column

In [67]:
# replace / with a space
unclean_train_data_v3['category_name'] = unclean_train_data_v3['category_name'].str.replace('/',' ')

In [71]:
%%time
unclean_train_data_v3['clean_category_name'] = unclean_train_data_v3['category_name'].apply(
    lambda x: stem_str_item_description(str(x)))

CPU times: user 3min 48s, sys: 1.28 s, total: 3min 49s
Wall time: 3min 52s


In [74]:
unclean_train_data_v3['clean_category_name'].isnull().sum()

0

### Clean name Column

In [80]:
# replace / with a space
unclean_train_data_v3['name'] = unclean_train_data_v3['name'].str.replace('/',' ')

In [81]:
%%time
unclean_train_data_v3['clean_item_name'] = unclean_train_data_v3['name'].apply(
    lambda x: stem_str_item_description(str(x)))

CPU times: user 3min 31s, sys: 752 ms, total: 3min 32s
Wall time: 3min 34s


In [82]:
unclean_train_data_v3['clean_item_name'].isnull().sum()

0

In [85]:
# unclean_train_data_v3.to_csv(
#     '/Users/joashc/Downloads/mercari-price-suggestion-challenge/partially_clean_train_data.csv', 
#     index=False)

In [53]:
unclean_train_data_v3 = pd.read_csv(
    '/Users/joashc/Downloads/mercari-price-suggestion-challenge/partially_clean_train_data.csv')
unclean_train_data_v3.shape

(1482486, 9)

## item_condition

In [54]:
item_condition = pd.concat([pd.DataFrame(unclean_train_data_v3.item_condition_id.value_counts()),
           pd.DataFrame(unclean_train_data_v3.item_condition_id.value_counts(normalize=True))], 
          axis=1)

item_condition.columns = ['item_condition_count', 'item_condition_percent']
item_condition.head()

Unnamed: 0,item_condition_count,item_condition_percent
1,640501,0.432045
3,432161,0.291511
2,375478,0.253276
4,31962,0.02156
5,2384,0.001608


In [59]:
item_condition_agg = (unclean_train_data_v3
    .groupby('item_condition_id')
    .agg({'price':['min', 'median', 'mean', 'max']})
    .reset_index()
)

item_condition_agg

Unnamed: 0_level_0,item_condition_id,price,price,price,price
Unnamed: 0_level_1,Unnamed: 1_level_1,min,median,mean,max
0,1,0.0,18.0,26.488162,2009.0
1,2,0.0,17.0,27.563231,2004.0
2,3,0.0,16.0,26.540711,2000.0
3,4,0.0,15.0,24.349212,1309.0
4,5,0.0,19.0,31.703859,522.0


This aggregation of item condition and the price does not really tell us much as the dataset contains difference categories of items like clothing, jewelery, etc

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import csv
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
import pandas as pd
from sklearn.metrics import confusion_matrix
import winsound
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import time

if __name__ == '__main__':
    #Reading in the file via csv library
    filepath = 'C:\\Users\\Joash\\Desktop\\University Stuff\\4B uni stuff\\SYDE 522\\522 Project\\SMS_spam_or_ham' \
               '\\spam_result'
    csvfile = open(filepath + '.csv', "rt", encoding="utf8")
    reader = csv.reader(csvfile)
    sms_stemmed = []
    classification = []
    sms = []
    for row in reader:
        if len(row[2]) != 0:
            sms_stemmed.append(row[2])
            sms.append(row[1])
            if row[0] == "spam":
                classification.append(1)
            elif row[0] == "ham":
                classification.append(0)
    sms_stemmed = sms_stemmed[1:]
    sms = sms[1:]
    print(len(sms_stemmed), len(classification))

    print(len(sms_stemmed), len(classification))
    # sms_pd = pd.DataFrame(sms_stemmed)
    # sms_pd.to_csv('check.csv')
    random_state = 2
    pre_score = []
    acc_score = []
    scores = []
    predicted_classes = []
    test_pred = []

    X_tr, X_te, y_tr, y_te = train_test_split(sms_stemmed, classification, test_size=0.30, random_state=random_state)

    max_features = 1500
    tfidf = TfidfVectorizer(max_features=max_features)
    x_tfidf = tfidf.fit_transform(sms_stemmed).toarray()
    classification = np.asarray(classification)
    print(type(x_tfidf), x_tfidf.shape, classification.shape)

    # split into train and test
    X_train, X_test, y_train, y_test = train_test_split(x_tfidf, classification, test_size=0.30, random_state=random_state)
    print(len(X_test), len(y_test))


    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    count = 1

    for train, validate in kfold.split(X_train, y_train):
        print('This is', count, 'fold!')
        model = Sequential()
        model.add(Dense(60, input_shape=(max_features,)))
        model.add(Activation('relu'))
        # model.add(Dropout(0.2))
        model.add(Dense(5))
        model.add(Activation('relu'))
        # model.add(Dropout(0.2))
        model.add(Dense(1))
        model.add(Activation('sigmoid'))
        model.summary()
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
        model.fit(X_train[train], y_train[train], batch_size=64, epochs=10, verbose=1)

        test_pred = model.predict(X_train[validate])
        # print(test_pred)
        predicted_classes = np.around(test_pred, decimals=0)
        # Creating the Confusion Matrix
        cm = confusion_matrix(y_train[validate], predicted_classes)
        print(cm)
        accuracy = (cm[0, 0] + cm[1, 1]) / X_train[validate].shape[0]
        precision = cm[0, 0] / (cm[0, 0] + cm[1, 0])
        print('The accuracy and precision of the model is:', round(accuracy, 3), 'and', round(precision, 3))
        acc_score.append(accuracy)
        pre_score.append(precision)
        count += 1

    print('Model accuracy is', round(sum(acc_score) / len(acc_score),3))
    print('Model precision is', round(sum(pre_score) / len(pre_score),3))
    t1 = time.time()
    test_pred = model.predict(X_test)
    predicted_classes = np.around(test_pred, decimals=0)
    t2 = time.time()
    print(t2 - t1)
    cm = confusion_matrix(y_test, predicted_classes)
    print(cm)
    # accuracy = (cm[0, 0] + cm[1, 1]) / X_test.shape[0]
    accuracy = accuracy_score(y_test, predicted_classes)
    # precision = cm[0, 0] / (cm[0, 0] + cm[1, 0])
    precision = precision_score(y_test, predicted_classes)
    print('Test accuracy is', accuracy)
    print('Test precision is', precision)

    print(len(X_test), len(test_pred))
    test_output = []
    for i in range(0, len(X_test), 1):
        test_output.append([X_te[i], y_test[i], predicted_classes[i][0]])

    test_output_df = pd.DataFrame(test_output, columns=['sms_stemmed', 'Actual Classification', 'Predicted Classification'])
    test_output_df.to_csv('output.csv', index=False)

    frequency = 2500  # Set Frequency To 2500 Hertz
    duration = 500  # Set Duration To 1000 ms == 1 second
    winsound.Beep(frequency, duration)

In [36]:
test_df['no_nulls_category_name'] = test_df.apply(lambda x: impute_category_name(x), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# Additional Things you can do

- Replace / with a space
- Get stuff out of the description of category (size, oz, etc)
- standardizing the categories (Clothes, Jewelery, Other)
- get gender for each item (men, women, child, both, other(animal))
