<a href="https://colab.research.google.com/github/karenbennis/Xy/blob/Data_ETL/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<br><br>**ETL**<br><br>

In [1]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

#Interact with SQL
#!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("uiewfhn").getOrCreate()


In [2]:
from pyspark import SparkFiles

In [26]:
#url = "https://raw.githubusercontent.com/karenbennis/Xy/Data_ETL/yelp.csv"
#spark.sparkContext.addFile(url)
#data_df = spark.read.csv(SparkFiles.get("yelp.csv"), sep=",", header=True)
#data_df.show()
import pandas as  pd

data_df=pd.read_csv('https://raw.githubusercontent.com/karenbennis/Xy/Data_ETL/yelp.csv')


In [27]:
def good(x):
    if x > 3:
        return 1
    if x < 3:
        return 0

data_df['class']=data_df['stars'].apply(good)


data_df=data_df.drop(['business_id','date','review_id','stars','type','user_id','cool','useful','funny'],axis=1)
data_df=data_df.dropna()

In [28]:
df = spark.createDataFrame(data_df)
df.show()

+--------------------+-----+
|                text|class|
+--------------------+-----+
|My wife took me h...|  1.0|
|I have no idea wh...|  1.0|
|love the gyro pla...|  1.0|
|Rosie, Dakota, an...|  1.0|
|General Manager S...|  1.0|
|Quiessence is, si...|  1.0|
|Drop what you're ...|  1.0|
|Luckily, I didn't...|  1.0|
|Definitely come f...|  1.0|
|Nobuo shows his u...|  1.0|
|The oldish man wh...|  1.0|
|Wonderful Vietnam...|  1.0|
|They have a limit...|  1.0|
|Good tattoo shop....|  1.0|
|I'm 2 weeks new t...|  1.0|
|Was it worth the ...|  0.0|
|okay this is the ...|  1.0|
|They've gotten be...|  1.0|
|This place should...|  1.0|
|first time my fri...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [29]:
# Import functions
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

In [31]:
from pyspark.sql.functions import length
# Create a length column to be used as a future feature
data_df = df.withColumn('length', length(df['text']))
data_df.show()

+--------------------+-----+------+
|                text|class|length|
+--------------------+-----+------+
|My wife took me h...|  1.0|   889|
|I have no idea wh...|  1.0|  1345|
|love the gyro pla...|  1.0|    76|
|Rosie, Dakota, an...|  1.0|   419|
|General Manager S...|  1.0|   469|
|Quiessence is, si...|  1.0|  2094|
|Drop what you're ...|  1.0|  1565|
|Luckily, I didn't...|  1.0|   274|
|Definitely come f...|  1.0|   349|
|Nobuo shows his u...|  1.0|   186|
|The oldish man wh...|  1.0|   298|
|Wonderful Vietnam...|  1.0|   321|
|They have a limit...|  1.0|   433|
|Good tattoo shop....|  1.0|   593|
|I'm 2 weeks new t...|  1.0|  1206|
|Was it worth the ...|  0.0|   705|
|okay this is the ...|  1.0|   363|
|They've gotten be...|  1.0|   726|
|This place should...|  1.0|   104|
|first time my fri...|  1.0|   148|
+--------------------+-----+------+
only showing top 20 rows



In [32]:
# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='class',outputCol='label')
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="token_text", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [33]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [34]:
# Create and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [35]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

In [36]:
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3], 21)

In [37]:
from pyspark.ml.classification import NaiveBayes
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [39]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(predictor.transform(testing))
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.815244


In [None]:
# Configure settings for RDS
mode = "append"
jdbc_url="jdbc:postgresql://challenge.cde4fgpazxbm.ca-central-1.rds.amazonaws.com:5432/"
config = {"user":'postgres', 
          "password": '', 
          "driver":"org.postgresql.Driver"}

