In [5]:
# Importing python modules
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql import Row
from os.path import abspath 

In [6]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType,FloatType,DoubleType

In [7]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier,GBTClassifier, RandomForestClassificationModel
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [12]:
# warehouse_location =abspath('/predictive_maintenance/data/predicted_data/')

In [9]:
# Reuse the kernel's SparkContext if one already exists; a bare SparkContext()
# raises "Cannot run multiple SparkContexts at once" when this cell is re-run.
sc = SparkContext.getOrCreate()

In [10]:
# Load the previously trained random-forest classifier from disk.
# NOTE(review): the path is relative to the notebook's working directory — confirm.
rfc_model_path = "../model/rfc_model"
rfc_model = RandomForestClassificationModel.load(rfc_model_path)

In [11]:
# Hive-enabled SparkSession used for every read and SQL statement below.
# Despite the name, `sqlcontext` is a SparkSession, not a legacy SQLContext.
session_builder = SparkSession.builder
sqlcontext = (
    session_builder
    .appName("Python Spark SQL Hive integration example")
    .enableHiveSupport()
    .getOrCreate()
)

In [13]:
# Per-machine rolling state, keyed by machine_id. The scoring loop stores
# (previous_state * 2 + prediction) % 4 here after every scored row.
last_status_dict = dict()

##### Start testing on 10 days of data

In [14]:
# One sensor CSV per day, 13 Nov 2017 through 22 Nov 2017 (file names use DDMMYYYY).
data = ['sensor_data_{0:02d}112017.csv'.format(day) for day in range(13, 23)]

In [16]:
# Create (if missing) the external Hive table that receives the per-session
# aggregates and their predictions. Column order here must match the order of
# the columns selected before insertInto() in the scoring loop.
create_table_ddl = (
    "CREATE EXTERNAL TABLE IF NOT EXISTS agg_predicted_data "
    "(machine_id string,session_id int,avg_sound double,avg_temperature double,max_cum_dist double,prediction double) "
    "ROW FORMAT DELIMITED FIELDS TERMINATED BY',' "
    "LINES TERMINATED BY '\n' "
    "LOCATION 'hdfs://localhost:9000/predictive_maintenance/data/predicted_data' "
)
sqlcontext.sql(create_table_ddl)

DataFrame[]

In [17]:
def _update_status(status_by_machine, machine_id, prediction):
    """Fold one prediction into a machine's rolling state; return True if abnormal.

    The first prediction seen for a machine is stored as-is. Afterwards the
    state becomes (previous_state * 2 + prediction) % 4. With 0/1 predictions
    (as in the displayed output — other class values are not handled specially)
    a state of 3 means the two previous predictions were both 1, so the machine
    is reported abnormal when a third consecutive 1 arrives.
    """
    if machine_id not in status_by_machine:
        status_by_machine[machine_id] = prediction
        return False
    previous = status_by_machine[machine_id]
    status_by_machine[machine_id] = (previous * 2 + prediction) % 4
    return previous == 3 and prediction == 1


# The assembler does not depend on the file being processed, so build it once.
assembler = VectorAssembler(inputCols=['avg(sound)','avg(temperature)','max(cum_dist)'], outputCol='features')

# NOTE: this cell is not idempotent — re-running it inserts the same rows into
# agg_predicted_data again and continues from the existing last_status_dict.
for each in data:
    # Reading and preprocessing: the CSVs have no header, so the positional
    # columns _c0.._c6 are given meaningful names.
    input_ = sqlcontext.read.csv('../data/test/' + each, inferSchema=True, header=False)
    input_data = input_
    for position, column in enumerate(
            ['machine_id', 'sensor_id', 'session_id', 'sess_time', 'cum_dist', 'temperature', 'sound']):
        input_data = input_data.withColumnRenamed('_c{0}'.format(position), column)

    # One row per (machine, session); ordered so the status update below sees
    # each machine's sessions in ascending session_id order.
    df_input = (
        input_data
        .groupBy(['machine_id', 'session_id'])
        .agg({'cum_dist': 'max', 'temperature': 'avg', 'sound': 'avg'})
        .orderBy('machine_id', 'session_id')
    )
    output = assembler.transform(df_input)

    test_out_df = rfc_model.transform(output)
    test_out_df = (
        test_out_df
        .withColumnRenamed('avg(sound)', 'avg_sound')
        .withColumnRenamed('avg(temperature)', 'avg_temperature')
        .withColumnRenamed('max(cum_dist)', 'max_cum_dist')
    )
    # insertInto() resolves columns by position, so this order must match the table DDL.
    tempdf = test_out_df.select(['machine_id', 'session_id', 'avg_sound', 'avg_temperature','max_cum_dist', 'prediction'])

    # tempdf feeds two actions (the Hive insert and the driver-side collect);
    # cache it so the read/aggregate/predict pipeline is not executed twice.
    tempdf.cache()
    try:
        tempdf.repartition(1).write.insertInto("agg_predicted_data")
        # Look up table: only machine_id and prediction are needed on the driver,
        # so the feature/probability vectors are not collected.
        for row in tempdf.select('machine_id', 'prediction').collect():
            if _update_status(last_status_dict, row.machine_id, row.prediction):
                print("{0} is Abnormal".format(row.machine_id))
    finally:
        tempdf.unpersist()
    print("------------------------------------------------------------")
    

------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
MID1 is Abnormal
MID10 is Abnormal
MID15 is Abnormal
MID20 is Abnormal
MID25 is Abnormal
MID30 is Abnormal
MID35 is Abnormal
MID40 is Abnormal
MID45 is Abnormal
MID5 is Abnormal
------------------------------------------------------------
MID1 is Abnormal
MID10 is Abnormal
MID15 is Abnormal
MID20 is Abnormal
MID25 is Abnormal
MID30 is Abnormal
MID35 is Abnormal
MID40 is Abnormal
MID45 is Abnormal
MID5 is Abnormal
------------------------------------------------------------
MID1 is Abnormal
MID10 is Abnormal
MID15 is Abnormal
MID20 is Abnormal
MID25 is Abnormal
MID30 is Abnormal
MID35 is Abnormal
MID40 is Abnormal
MID45 is Abnormal
MID5 is Abnormal
MID50 is Abnormal
MID55 is Abnormal
MID

In [19]:
# Display the per-machine state left behind by the scoring loop; each value is
# the last (previous_state * 2 + prediction) % 4 stored for that machine_id.
last_status_dict

{'MID1': 3.0,
 'MID10': 3.0,
 'MID100': 0.0,
 'MID11': 0.0,
 'MID12': 0.0,
 'MID13': 0.0,
 'MID14': 0.0,
 'MID15': 3.0,
 'MID16': 0.0,
 'MID17': 0.0,
 'MID18': 0.0,
 'MID19': 0.0,
 'MID2': 0.0,
 'MID20': 3.0,
 'MID21': 3.0,
 'MID22': 0.0,
 'MID23': 0.0,
 'MID24': 0.0,
 'MID25': 3.0,
 'MID26': 0.0,
 'MID27': 0.0,
 'MID28': 0.0,
 'MID29': 0.0,
 'MID3': 0.0,
 'MID30': 3.0,
 'MID31': 0.0,
 'MID32': 0.0,
 'MID33': 0.0,
 'MID34': 0.0,
 'MID35': 3.0,
 'MID36': 0.0,
 'MID37': 0.0,
 'MID38': 0.0,
 'MID39': 0.0,
 'MID4': 0.0,
 'MID40': 3.0,
 'MID41': 0.0,
 'MID42': 3.0,
 'MID43': 0.0,
 'MID44': 0.0,
 'MID45': 3.0,
 'MID46': 0.0,
 'MID47': 0.0,
 'MID48': 0.0,
 'MID49': 0.0,
 'MID5': 3.0,
 'MID50': 3.0,
 'MID51': 0.0,
 'MID52': 0.0,
 'MID53': 0.0,
 'MID54': 0.0,
 'MID55': 3.0,
 'MID56': 0.0,
 'MID57': 0.0,
 'MID58': 0.0,
 'MID59': 0.0,
 'MID6': 0.0,
 'MID60': 3.0,
 'MID61': 0.0,
 'MID62': 0.0,
 'MID63': 0.0,
 'MID64': 0.0,
 'MID65': 3.0,
 'MID66': 0.0,
 'MID67': 0.0,
 'MID68': 0.0,
 'MID69': 0.0,


##### End testing on 10 days of data

In [20]:
# List the tables visible in the current Hive database.
tables_df = sqlcontext.sql("show tables")
tables_df.show()

+--------+------------------+-----------+
|database|         tableName|isTemporary|
+--------+------------------+-----------+
| default|          agg_data|      false|
| default|agg_predicted_data|      false|
| default|            lookup|      false|
| default|        sensordata|      false|
| default|         test_data|      false|
| default|          test_out|      false|
| default|        train_data|      false|
+--------+------------------+-----------+



In [21]:
# Print the stored predictions, capped at MAX_ROWS_TO_SHOW rows.
MAX_ROWS_TO_SHOW = 1000
predicted_df = sqlcontext.sql("select * from agg_predicted_data")
predicted_df.show(MAX_ROWS_TO_SHOW)

+----------+----------+------------------+------------------+------------+----------+
|machine_id|session_id|         avg_sound|   avg_temperature|max_cum_dist|prediction|
+----------+----------+------------------+------------------+------------+----------+
|      MID1|       324| 7.110543148305303| 39.94734899093489|        3.78|       1.0|
|     MID10|       324| 7.099947725183335|39.946272455257784|       1.943|       1.0|
|    MID100|       324| 2.247957527359234|29.390243902439025|       4.533|       0.0|
|     MID11|       324| 2.233621897693422|29.458452722063036|       9.667|       0.0|
|     MID12|       324|2.2433689040148383| 29.62753036437247|       5.467|       0.0|
|     MID13|       324| 2.237400334433626| 29.46643109540636|        7.05|       0.0|
|     MID14|       324|2.2555931735456585| 29.31983805668016|       5.467|       0.0|
|     MID15|       324| 7.016598562372428| 39.85738832163087|       1.577|       1.0|
|     MID16|       324|2.2493855205673166| 29.45414847