In [1]:
import findspark
findspark.init('/home/kakade/spark')
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession,SQLContext
spark = SparkSession.builder.appName("kakade").getOrCreate()

In [2]:
from pyspark.sql.types import StructField,StringType,FloatType,StructType
#Add header to data feature
feature_schema = [StructField("country", StringType(), True), StructField("sku_id", StringType(), True), \
                  StructField("title",StringType(), True), StructField("category_1",StringType(), True), \
                  StructField("category_2",StringType(), True), StructField("category_3",StringType(), True), \
                  StructField("description",StringType(), True), StructField("org_price", FloatType(), True), \
                  StructField("product_type",StringType(), True)]
# country sku_id title category_1 category_2 category_3 short_description price product_type 
feature_struc = StructType(fields=feature_schema)

In [3]:
train_feature = spark.read.csv("../Data/training/data_train.csv",schema=feature_struc)

In [None]:
#most other category only contain <=5 products
#filter products out of the main categories
# train_feature.groupBy("category_1").count().orderBy("count",ascending=False).show(truncate=False)
main_category=train_feature.groupBy("category_1").count().filter("count>5")
train_feature = train_feature.join(main_category, col("train_feature.category_1") == col("main_category.category_1"), 'inner')

In [4]:
from bs4 import BeautifulSoup
from pyspark.sql.functions import udf
from pyspark.ml.feature import Tokenizer, RegexTokenizer,StopWordsRemover,CountVectorizer

In [5]:
def html_extract_func(col):
    soup=BeautifulSoup(col)
    return soup.get_text()
    return (1-col1/col2)**2
html_extract = udf(html_extract_func, StringType())
train_feature = train_feature.withColumn("html_extract",html_extract("description"))
# train_feature.select("html_extract").show(truncate=False)

In [7]:
import pyspark.sql.functions as F
des_cat_test = train_feature.select("category_1","sku_id",F.regexp_replace(train_feature.html_extract, '(\d+)', ' ').alias('html_extract'))
regexTokenizer = RegexTokenizer(inputCol="html_extract", outputCol="description_token", pattern="\\W")
des_cat_test = regexTokenizer.transform(des_cat_test)
stop_words_remover = StopWordsRemover(inputCol="description_token", outputCol="stop_words_filtered")
des_cat_test = stop_words_remover.transform(des_cat_test)
des_cat_test.select("stop_words_filtered").show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|stop_words_filtered                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+-------------------------------------------------------

In [13]:
##currency_exchange
#set all price to PHP
import pyspark.sql.functions as F
from forex_python.converter import CurrencyRates
cex = CurrencyRates()
S2P=cex.get_rate("SGD","PHP")
M2P=cex.get_rate("MYR","PHP")
train_feature=train_feature.withColumn("price", F.when(train_feature.country == "my", M2P*train_feature.org_price).when(train_feature.country == "sg", S2P*train_feature.org_price).otherwise(train_feature.org_price))
                                    

In [15]:
train_feature.select("price","org_price").show()

+------------------+---------+
|             price|org_price|
+------------------+---------+
|            597.31|     49.0|
|           1560.32|    128.0|
| 305.6032962799072|    25.07|
|1438.4199999999998|    118.0|
|1399.4120372009277|    114.8|
|31681.809999999998|   2599.0|
| 4741.787980957031|   388.99|
|126.77599534988403|     10.4|
|               0.0|      0.0|
|            304.75|     25.0|
|115.56119441986084|     9.48|
| 950.8199999999999|     78.0|
|189.55450232505797|    15.55|
|6082.8099999999995|    499.0|
|            1462.8|    120.0|
|              null|     null|
| 1031.273981399536|     84.6|
|             365.7|     30.0|
| 203.0853981399536|    16.66|
|2072.2999999999997|    170.0|
+------------------+---------+
only showing top 20 rows



In [4]:
train_feature.groupBy("category_1").count().orderBy("count",ascending=False).show(truncate=False)

+--------------------------------------------------------------+-----+
|category_1                                                    |count|
+--------------------------------------------------------------+-----+
|Mobiles & Tablets                                             |7273 |
|Home & Living                                                 |6042 |
|Fashion                                                       |5729 |
|Watches Sunglasses Jewellery                                  |4216 |
|Health & Beauty                                               |4040 |
|Computers & Laptops                                           |2882 |
|TV, Audio / Video, Gaming & Wearables                         |2505 |
|Cameras                                                       |1950 |
|Home Appliances                                               |1583 |
| 8GB                                                          |5    |
| 4GB                                                          |5    |
|A1466