## Scrapping

In [2]:
! pip install python-craigslist



In [1]:
# Import Soup and create parser

from bs4 import BeautifulSoup
from craigslist import CraigslistForSale
import requests
import json
from time import sleep
from numpy import random
from IPython.display import clear_output

In [2]:
# finds the body of the text
def find_body_text(soup):
    body = soup.find("section", {"id": "postingbody"})
    body_text = ""
    try:
        body_text = body.find_all(text=True, recursive = False)[1]
    except:
        None
    return body_text

In [3]:
def find_title_text(soup):
    title = soup.find("span", {"id": "titletextonly"})
    title_text = title.text
    return title_text

In [4]:
# need to do a try expect for empty image set
def find_image_list(soup):
    image_links = []
    image = soup.find("div", {"id": "thumbs"})
    try:
        image_anchors = image.find_all("a", {"class": "thumb"})
        for i in image_anchors:
            image_links.append(i["href"])
    except:
        None
    return image_links

In [5]:
# structure
# id: {title: "Text"
#      description: "Text"
#      region: "location"  
#      images: ["url1", "url2", "url3"]}
listings = {}


# create the Craiglist object for tippecanoe furnature
cl_tp = CraigslistForSale(site='tippecanoe', category='fua')
print("Tippecanoe Furnature Listing Count: {}".format(cl_tp.get_results_approx_count()))

cl_indy = CraigslistForSale(site="indianapolis", category ='fua')
print("Indianapolis Furnature Listing Count: {}".format(cl_indy.get_results_approx_count()))


count = 0
# iterate through the results to pull the id and the url of each listing
for result in cl_tp.get_results(sort_by='newest', geotagged=True):
    URL = result['url']
    sleep(random.uniform(2,3))
    clear_output(wait=True)
    count+=1
    print("Pulling request {}: {}".format(count, URL))
    page = requests.get(URL)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    holder = {}
    holder['region'] = "tippecanoe"
    holder['title'] = find_title_text(soup)
    holder['description'] = find_body_text(soup)
    holder['images'] = find_image_list(soup)
    
    listings[result["id"]] = holder

Tippecanoe Furnature Listing Count: 184
Indianapolis Furnature Listing Count: 1545


KeyboardInterrupt: 

In [None]:
# iterate through the results for indy
count = 0

for result in cl_indy.get_results(sort_by='newest', geotagged=True):
    URL = result['url']
    sleep(random.uniform(2,3))
    page = requests.get(URL)
    soup = BeautifulSoup(page.text, 'html.parser')
    count += 1
    clear_output(wait=True)
    print("Pulling request {}: {}".format(count, URL))
    
    holder = {}
    holder['region'] = "indianoplis"
    holder['title'] = find_title_text(soup)
    holder['description'] = find_body_text(soup)
    holder['images'] = find_image_list(soup)
    
    listings[result["id"]] = holder  

In [None]:
json_object = json.dumps(listings, indent = 4)
with open("listings.json", "w") as outfile:
    outfile.write(json_object)

In [None]:
# convert the json to csv for submission
with open("listings.json") as f:
    file_content = f.read()

listings = json.loads(file_content)

In [None]:
# turn json into csv as per the requirements
# Only useful the first time, hard coded labels into the xlsx file

import csv

header = ["id", "region", "title", "description", "images", "label"]
fname = "listings.csv"
with open(fname, 'w', encoding = "UTF8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(header)
    for i in listings:
        row=[]
        row.append(i)
        row.append(listings[i]["region"])
        row.append(listings[i]["title"])
        row.append(listings[i]["description"][1:])
        row.append(listings[i]["images"])
        writer.writerow(row)

## Text Classification

### Random Forest Algo

In [6]:
import pandas as pd
import html
import numpy as np
import string
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Austi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Austi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Austi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Austi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Austi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
# reading excel file that was scrapped from Craigslist
df = pd.read_excel('listings.xlsx')[:1299]
df = df.iloc[:1299, :-1]
def latin_utf8(text):
    try:
        return text.encode("latin1").decode("utf8")
    except:
        return text

for col in ['title','description']:
    # fix the php and html parsing issues
    df[col] = df[col].apply(lambda text: html.unescape(str(text)))
    # fix no space after period
    df[col] = df[col].apply(lambda text: re.sub(r'[\.]', " ", text))
    # fix the latin encoding issue for décor
    df[col] = df[col].apply(lambda text: latin_utf8(text))

    # remove and punctuation and replace with blank don't => dont, this will help fix some spelling and grammar issues
    df[col] = df[col].apply(lambda text: re.sub(r'[^\w\s]', "", text))
    
# combining title and description into a string
df['combined'] = df['title']+ ' ' +df['description']

#divide data into train and test set
train = df.sample(frac=0.9)
test = df.drop(train.index)
X_train = train['combined']
X_test = test['combined']
y_train = train['label']
y_test = test['label']

In [8]:
# X_train pre processing
X_train_token = [nltk.word_tokenize(str(l)) for l in X_train]
remove_list = list(string.punctuation)+stopwords.words('english')
X_train_token = [[word.lower() for word in review if word.lower() not in remove_list and word.isalpha()] for review in X_train_token]

lemmatizer = nltk.stem.WordNetLemmatizer()
X_train_lemm = []
for review in X_train_token:
    lemreview = []
    for token in review:
        lemm = lemmatizer.lemmatize(token)
        lemreview.append(lemm)
    X_train_lemm.append(lemreview)

vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1,2))
X_train_processed = []
for review in X_train_lemm:
    review = ' '.join(review)
    X_train_processed.append(review)

X_train_tfidf = vectorizer.fit_transform(X_train_processed).toarray()

# X_test pre processing
X_test_token = [nltk.word_tokenize(str(l)) for l in X_test]
X_test_token = [[word.lower() for word in review if word.lower() not in remove_list and word.isalpha()] for review in X_test_token]

lemmatizer = nltk.stem.WordNetLemmatizer()
X_test_lemm = []
for review in X_test_token:
    lemreview = []
    for token in review:
        lemm = lemmatizer.lemmatize(token)
        lemreview.append(lemm)
    X_test_lemm.append(lemreview)

X_test_processed = []
for review in X_test_lemm:
    review = ' '.join(review)
    X_test_processed.append(review)

X_test_tfidf = vectorizer.transform(X_test_processed).toarray()

### Random Forest Test

In [9]:
# Random Forest Test
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_tfidf,y_train)
y_test_pred_rf = rf.predict(X_test_tfidf)
y_train_pred_rf = rf.predict(X_train_tfidf)
score_train_rf = accuracy_score(y_train,y_train_pred_rf)
score_test_rf = accuracy_score(y_test, y_test_pred_rf)
print("Train Score: ",score_train_rf)
print("Test Score: ",score_test_rf)

Train Score:  0.9948674080410608
Test Score:  0.8


### Dataframe with ID, Title_Description, Predicted Label, Probability

In [10]:
proba_mat = pd.DataFrame(rf.predict_proba(X_test_tfidf)).apply(lambda x: x.max(),axis=1)
labels = pd.Series(y_test_pred_rf)
combined = pd.Series(X_test).reset_index().drop('index',axis=1)
df_proba = pd.concat([test['id'].reset_index().drop('index',axis=1),combined,labels,proba_mat],axis=1)
df_proba.columns = ['id','title_description','label','proba']
df_proba

Unnamed: 0,id,title_description,label,proba
0,7565015560,Entertainment Center and Hall Tree The Custom ...,entertainment,0.679405
1,7564716075,Love seat w recliner This faux leather love se...,seating,0.890000
2,7562281392,FullDouble Mattress Never Slept In 11 inch Co...,sleeping,0.680000
3,7561155451,Hanging Fruit Hanging glass fruit Never been used,storage,0.410000
4,7551850196,Benchpadded Sturdy bench,storage,0.300000
...,...,...,...,...
125,7556947693,bedroom dresser and mirror Arbek oak dresser a...,storage,0.840000
126,7556847085,Granite Topped Bombay End Table Nightstand Gra...,table,0.660000
127,7556733354,Pottery Barn Corner Desk Large white corner de...,table,0.695000
128,7556246473,Modern sectional sofa Modern gray sectional so...,seating,0.900000


## Exception Model

In [11]:
import numpy as np 
import pandas as pd
import os
import tensorflow as tf
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Flatten, GlobalAveragePooling2D, Activation, Flatten, Dropout
from tensorflow.python.keras import optimizers, regularizers
from tensorflow.keras.applications.xception import Xception
from keras.applications.xception import preprocess_input
from tensorflow.python.keras.models import Model
from tensorflow.keras.optimizers import SGD
from keras_preprocessing.image import ImageDataGenerator
from keras.preprocessing import image
import glob
from datasets.utils.file_utils import get_datasets_user_agent
import requests
from ast import literal_eval

import io
from PIL import Image
import urllib

### Downloading the images (Only run once)

In [12]:
# load the xlsx
# downloading the images to the right directory from craiglist, only the first image, based on label

#df_load = pd.read_excel(r'listings.xlsx')

#df = df_load.iloc[:1299, :-1]


df["images"] = df["images"].apply(literal_eval)
path = "Data/train/"

for index, row in df.iterrows():
    # list of images from listing
    image_list = row["images"]
    
    #pull the first image in the list
    try:
        image_url = image_list[0]
    except:
        continue
    #set to listing id
    image_id = row["id"]
    label = row["label"]
    image_path = path + "cl_" + label + "/" + str(image_id) +".jpg"
    USER_AGENT = get_datasets_user_agent()
    
    #if the image_url is empty this will break

    request = urllib.request.Request(
        image_url,
        data=None,
        headers={"user-agent": USER_AGENT},
    )

    if not os.path.isfile(image_path):
        with urllib.request.urlopen(request, timeout=None) as req:
            image = Image.open(io.BytesIO(req.read()))
            image.save(image_path)

### Creating the data to feed to the NN

In [13]:
# make a mapping of the cl_id, filepath and label
mappings = pd.DataFrame(columns = ["id","filepath","label"], index=[""]).dropna()

In [14]:
mappings.shape

(0, 3)

In [15]:
### maps every image path to a label for training

path = "Data/train"
# get all folders in train set
with os.scandir(path) as entries:
    for entry in entries:
        directory = entry.name
        label = ""
        cl = False
        cl_id = -1
        
        if directory == "bed":
            label = "sleeping"
        elif directory in ["sofa", "chair", "swivelchair"]:
            label = "seating"
        elif directory == "table":
            label = "table"
        elif directory[:3] == "cl_":
            label = directory[3:]
            cl = True
        else:
            continue
            
        
        #get all images in a specific folder
        directory_path = path +"/"+entry.name
        with os.scandir(directory_path) as files:
            for file in files:
                img_path = directory_path + "/" + file.name
                
                #This is not a good way to do this and should be refactored
                if cl:
                    cl_id = file.name[:-4]
                    
                mappings.loc[len(mappings.index)] = [cl_id, img_path, label]

In [16]:
# Match the test set for the model

In [17]:
#test_df=pd.DataFrame(columns = ["id","filepath","true_label", "prediction", "proba"], index=[""]).dropna()

# for some reason cant merge unless both str
test = test.astype({'id':'str'})
mappings = mappings.astype({'id': 'str'})


# expect some NaN for filepath as the not all have pictures
test_df = pd.merge(test, mappings, on='id', how='left')
test_df = test_df.rename(columns={"label_x": "label", "label_y": "prediction_NN"})
test_df["prediction_NN"] = ""

In [18]:
mappings.shape

(7439, 3)

In [19]:
train_val_df = mappings

for index, row in test_df.iterrows():
    cl_id = row['id']
    train_val_df.drop(train_val_df[train_val_df['id']==cl_id].index, inplace=True)

In [20]:
train_val_df.shape

(7347, 3)

### Training RF

In [None]:
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import numpy as np
import cv2

### Training NN

In [23]:
# instantiating the model

model = Sequential()
model.add(Xception(include_top=False, pooling='avg', weights="imagenet"))
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.8))
model.add(Dense(6, activation='softmax'))
model.layers[0].trainable = False
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

In [22]:
image_size = 299
BATCH_TRAIN = 16
BATCH_VAL = 16
datagen = ImageDataGenerator(preprocessing_function=preprocess_input, 
                             rescale=1./255.,
                             horizontal_flip=True,
                             width_shift_range = 0.2,
                             height_shift_range = 0.2,
                             validation_split=0.15)

# removed the directory=r"./furniture_images/", arg
train_generator=datagen.flow_from_dataframe(dataframe=train_val_df,
                        x_col="filepath",
                        y_col="label",has_ext=False,
                        subset="training",batch_size=BATCH_TRAIN,
                        shuffle=True,
                        class_mode="categorical",
                        target_size=(image_size, image_size))

# removed the directory=r"./furniture_images", arg
valid_generator=datagen.flow_from_dataframe(
                        dataframe=train_val_df,
                        x_col="filepath",
                        y_col="label",has_ext=False,
                        subset="validation",batch_size=BATCH_VAL,
                        class_mode="categorical",target_size=(image_size, image_size))



Found 6245 validated image filenames belonging to 6 classes.
Found 1102 validated image filenames belonging to 6 classes.


#### Verify GPU CUDA is detected

In [36]:
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17118025501331182959
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5762973696
locality {
  bus_id: 1
  links {
  }
}
incarnation: 18147099931162551604
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3060 Ti, pci bus id: 0000:08:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


### Train the model (skip and use loading)

In [37]:
steps_per_train = int(np.ceil(train_generator.n / BATCH_TRAIN))
steps_per_val = int(np.ceil(train_generator.n / BATCH_VAL))

model.fit(train_generator,
          steps_per_epoch=steps_per_train,
          validation_data=valid_generator,
          validation_steps=steps_per_val,
          epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5

KeyboardInterrupt: 

In [25]:
### uncomment based on what you are doing

# save the model
# model.save('trained1_model')

# load the model
# from tensorflow import keras
# model = keras.models.load_model('path/to/location')



INFO:tensorflow:Assets written to: trained_model\assets


INFO:tensorflow:Assets written to: trained_model\assets


### Prediction

In [24]:
from PIL import Image
import numpy as np
from skimage import transform
from tensorflow.keras.utils import load_img,img_to_array

# def load(filename):
#     np_image = Image.open(filename)
#     np_image = np.array(np_image).astype('float32')/255
#     np_image = transform.resize(np_image, (299, 299, 3))
#     np_image = np.expand_dims(np_image, axis=0)
#     return np_image

def load(filename):
    img = load_img(filename, target_size=(299,299))
    #convert image to array
    input_img = img_to_array(img)
    input_img = np.expand_dims(input_img, axis=0)
    input_img = preprocess_input(input_img)
    return input_img

# get the list of labels and there indexs
labels = (train_generator.class_indices)
labels = list(labels.keys())
print(labels)

correct = 0
total = 0
    
for index, row in test_df.iterrows():
    try:
        img = load(row["filepath"])
        pred = model.predict(img)
        pred = list(pred[0])
        prob = max(pred)
        lab = labels[pred.index(prob)]
        test_df.loc[test_df.index == index, "prediction_NN"] = lab
        test_df.loc[test_df.index == index, "proba_NN"] = prob
        total += 1
        if lab == row["label"]:
            correct += 1
    except:
        pass

print(correct/total)

['entertainment', 'other', 'seating', 'sleeping', 'storage', 'table']
0.35555555555555557


## Ensemble

In [26]:
# Cleaning up
df_proba = df_proba.astype({'id':'str'})
test_df = test_df.astype({'id': 'str'})
test_df = test_df.drop(["description", "region", "combined"], axis = 1)

df_proba = df_proba.rename(columns={"label": "prediction_rf", "proba": "proba_rf"})

In [28]:
# expect some NaN for filepath as the not all have pictures
pred_df = pd.merge(df_proba, test_df, on='id', how='left')
#pred_df = test_df.rename(columns={"label_x": "label", "label_y": "prediction"})


# need to check the probabilities and then assign values

pred_df.head()

Unnamed: 0,id,title_description,prediction_rf,proba_rf,title,images,label,filepath,prediction_NN,proba_NN
0,7565015560,Entertainment Center and Hall Tree The Custom ...,entertainment,0.679405,Entertainment Center and Hall Tree,['https://images.craigslist.org/00S0S_fi2XJS06...,other,Data/train/cl_other/7565015560.jpg,entertainment,0.200292
1,7564716075,Love seat w recliner This faux leather love se...,seating,0.89,Love seat w recliner,['https://images.craigslist.org/01515_1WBeEG4z...,seating,Data/train/cl_seating/7564716075.jpg,seating,0.208093
2,7562281392,FullDouble Mattress Never Slept In 11 inch Co...,sleeping,0.68,FullDouble Mattress Never Slept In,['https://images.craigslist.org/00h0h_e1OdRheg...,sleeping,Data/train/cl_sleeping/7562281392.jpg,seating,0.211753
3,7561155451,Hanging Fruit Hanging glass fruit Never been used,storage,0.41,Hanging Fruit,['https://images.craigslist.org/00U0U_7JnutRYK...,other,Data/train/cl_other/7561155451.jpg,other,0.252546
4,7551850196,Benchpadded Sturdy bench,storage,0.3,Benchpadded,['https://images.craigslist.org/01414_75BmImkG...,seating,Data/train/cl_seating/7551850196.jpg,seating,0.205198


In [29]:
pred_df.shape

(130, 10)

In [30]:
#take which ever model is more confident

for index, row in pred_df.iterrows():
    if row["proba_rf"] > row["proba_NN"] or (np.isnan(row["proba_NN"])):
        pred_df.loc[pred_df.index == index, "prediction_EN"] = row["prediction_rf"]
        pred_df.loc[pred_df.index == index, "proba_EN"] = row["proba_rf"]
    else:
        pred_df.loc[pred_df.index == index, "prediction_EN"] = row["prediction_NN"]
        pred_df.loc[pred_df.index == index, "proba_EN"] = row["proba_NN"]

In [31]:
final_total = 0
final_correct = 0
for index, row in pred_df.iterrows():
    if row["prediction_EN"] == row["label"]:
        final_correct +=1
    final_total += 1
print(final_correct/final_total)

0.8
