# Setting Up Pipeline for Streamlit

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS

import pickle

#widen the pandas display limits so large frames are shown without truncation
display_options = (
    ('display.max_rows', 1000),
    ('display.max_columns', 500),
    ('display.width', 1000),
)
for option_name, option_value in display_options:
    pd.set_option(option_name, option_value)

#seed numpy's global random generator so random operations are repeatable
np.random.seed(42)

In [2]:
#found code solution from stackoverflow: https://stackoverflow.com/questions/48376580/google-colab-how-to-read-data-from-my-google-drive

#mount Google Drive at /content/drive (this cell relies on the google.colab package)
from google.colab import drive
drive.mount('/content/drive')

#make the Capstone folder the working directory; the relative paths used in this notebook
#(the CSV read here and the '<target>_pipe.pkl' files written and read later) resolve against it
%cd /content/drive/My Drive/Datasets/Capstone/

#load the review dataset into the notebook-level dataframe used by the cells below
df = pd.read_csv('review_data_for_modeling.csv')

Mounted at /content/drive
/content/drive/My Drive/Datasets/Capstone


In [4]:
#preview the first three rows to check the text column and the label columns
df.head(3)

Unnamed: 0,comments,location,host,hygeine,comfort,cost
0,dont like deposit 2weeks release card,0,0,0,0,0
1,location location location best spot city went...,1,0,0,0,0
2,hello experience nice gonzalo nora great lande...,0,0,1,1,0


In [19]:
#creating function to create and pickle pipelines
def pickle_pipelines(target, data=None, random_state=42):
  """Fit a CountVectorizer + LogisticRegression pipeline for one label and pickle it.

  The pipeline is trained on the 'comments' text column to predict the
  `target` column and is written to '<target>_pipe.pkl' in the current
  working directory.

  Parameters
  ----------
  target : str
      Name of the label column to predict; also used as the pickle file prefix.
  data : pandas.DataFrame, optional
      Frame holding a 'comments' column and the `target` column. Defaults to
      the notebook-level `df`.
  random_state : int or None, optional
      Seed for the train/test split, so the fitted model does not depend on
      the order in which cells were executed. Pass None to use numpy's
      global random state instead.

  Returns
  -------
  None
      The fitted pipeline is persisted to disk and a confirmation is printed.
  """
  #fall back to the notebook-level dataframe when no frame is passed in
  if data is None:
    data = df

  #prepare features and label
  X = data['comments']
  y = data[target]

  #train test split; only the training portion is used to fit the pipeline,
  #so the held-out portion is discarded here
  X_train, _, y_train, _ = train_test_split(
    X, y, stratify=y, random_state=random_state)

  #set up stopwords
  #NOTE(review): the multi-word entries ('air bnb', 'new york') presumably never match,
  #since stop words are compared against single tokens - confirm before removing them
  custom_stop = ['airbnb', 'air bnb', 't', 's', 'air', 'bnb', 'new', 'york', 'new york', 'stay', 'place', 'apartment', 'll', 've']
  #newer scikit-learn releases validate stop_words as 'english', a list or None,
  #so the combined set is converted to a (sorted, deterministic) list
  stop_words = sorted(text.ENGLISH_STOP_WORDS.union(custom_stop).union(STOP_WORDS))

  #set up pipeline
  pipe = Pipeline([
    ('cv', CountVectorizer(stop_words=stop_words)),
    ('lr', LogisticRegression(solver='liblinear'))])

  #fit pipeline
  pipe.fit(X_train, y_train)

  #pickle pipeline (pickle.dump returns None, so its result is not kept)
  with open('{}_pipe.pkl'.format(target), 'wb') as pickle_out:
    pickle.dump(pipe, pickle_out)

  print(f'{target} pickle completed')

In [20]:
#fit and pickle one pipeline for each label column

features = ['location', 'host', 'hygeine', 'comfort', 'cost']

for target_name in features:
  pickle_pipelines(target_name)

location pickle completed
host pickle completed
hygeine pickle completed
comfort pickle completed
cost pickle completed


# Test Pickled Pipelines

In [21]:
#reload the pickled location pipeline from disk to check that it round-trips
with open('location_pipe.pkl', 'rb') as pkl_file:
    pipe = pickle.load(pkl_file)

In [23]:
#sanity check: run the reloaded pipeline on one sample review
sample_review = 'great sam accommodating responds super fast messages definite plus train close looks exactly photos'
pipe.predict([sample_review])

array([1])