This notebook:
1. creates the 3D array
2. inserts the tf-idf values into it
3. extracts the eigenvalues
4. creates and saves the feature matrix

In [None]:
!pip install pyspark

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numpy.linalg import eig
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.types as t
import pyspark.sql.functions as f

In [None]:
# Get (or create) the Spark session, requesting 4g for both the driver memory
# and the maximum size of results returned to the driver.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "4g")
    .config("spark.driver.maxResultSize", "4g")
    .getOrCreate()
)

In [None]:
# Load the shingle tf-idf table: the first CSV line is the header and the
# column types are inferred from the data.
df = spark.read.csv(
    "../input/temperory/shingles_tf_idf.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Expose the DataFrame to Spark SQL under the view name "df".
# createOrReplaceTempView supersedes registerTempTable, which has been
# deprecated since Spark 2.0 and only forwards to this method.
df.createOrReplaceTempView("df")

In [None]:
# List the distinct values of the `second` column (queried through the "df"
# temp view) and bring them to pandas so the cell displays them as a table.
distinct_second = spark.sql('''
select distinct(second) from df
''')
distinct_second.toPandas()

Creating the array

In [None]:
# Zero-initialised float64 array of shape (20000, 26, 26).
# It is later indexed as [sentence_name - 1, ord(first) - 97, ord(second) - 97],
# i.e. presumably one 26 x 26 letter-pair matrix per sentence -- TODO confirm.
arr = np.zeros((20000, 26, 26), dtype=np.float64)

In [None]:
# Copy every tf-idf value into its (sentence, first, second) cell of `arr`.
ORD_A = ord("a")  # 97: maps 'a'..'z' onto the indices 0..25
n_sentences, n_first, n_second = arr.shape

# toLocalIterator streams the rows to the driver partition by partition
# instead of materialising the whole table as one Python list, which is what
# df.rdd.collect() does.
for row in df.toLocalIterator():
    first, second = row["first"], row["second"]

    # Rows without a complete pair have no cell to fill: either the "NA"
    # marker, or a null produced by the CSV reader (ord(None) would raise).
    if first is None or second is None or first == "NA" or second == "NA":
        continue

    # The -1 suggests sentence_name is 1-based -- TODO confirm.
    i = row["sentence_name"] - 1
    j = ord(first) - ORD_A
    k = ord(second) - ORD_A

    # Reject out-of-range indices explicitly. A negative index would otherwise
    # wrap around silently (e.g. 'Z' -> -7 lands on the 't' slot) and
    # overwrite an unrelated cell.
    if not (0 <= i < n_sentences and 0 <= j < n_first and 0 <= k < n_second):
        raise ValueError(
            f"row out of range for arr{arr.shape}: "
            f"sentence_name={row['sentence_name']!r}, first={first!r}, second={second!r}"
        )

    arr[i, j, k] = row["tf_idf"]

In [None]:
# Persist the filled array to arr.npy in NumPy's binary .npy format.
np.save(file='arr.npy', arr=arr)

Getting the eigenvalues

In [None]:
# Build one row of eigenvalue features per matrix in `arr`.
N_EIGENVALUES = 10  # eigenvalues kept per matrix

# Size the result from arr itself instead of repeating the sentence count.
x = np.empty((arr.shape[0], N_EIGENVALUES), dtype=float)
for i, matrix in enumerate(arr):
    w, _ = eig(matrix)  # eigenvectors are not needed

    # numpy.linalg.eig returns the eigenvalues in no particular order, so a
    # plain w[:10] would keep an arbitrary subset. Order them by descending
    # magnitude first; the stable sort keeps the result deterministic on ties
    # (e.g. complex-conjugate pairs).
    order = np.argsort(-np.abs(w), kind="stable")

    # As before, only the real part of each eigenvalue is kept.
    x[i] = w[order[:N_EIGENVALUES]].real

Creating the feature matrix

In [None]:
# Wrap the per-matrix eigenvalue rows in a DataFrame with columns f0 ... f9.
feature_names = [f"f{idx}" for idx in range(10)]
feature_matrix = pd.DataFrame(x, columns=feature_names)

In [None]:
# Write the feature matrix to feature_matrix.csv, leaving out the index column.
feature_matrix.to_csv(path_or_buf="feature_matrix.csv", index=False)

In [None]:
# Add a 1-based row counter `n`; it serves as the join key against the
# training data further down.
row_count = len(feature_matrix)
feature_matrix["n"] = range(1, row_count + 1)

In [None]:
# Show the feature matrix as the cell output for a visual check.
feature_matrix

In [None]:
# Load the training table; it is joined to the features on its
# `sentence_number` column and supplies the `toxic` column.
train = pd.read_csv(filepath_or_buffer="../input/temperory/train.csv")

Joining the feature matrix with the training data

In [None]:
# Join the features to the training rows on the sentence counter and keep only
# the ten feature columns plus the `toxic` column.
output_columns = [f"f{idx}" for idx in range(10)] + ["toxic"]

# Rename the training key so both frames share the join column `n`.
train_keyed = train.rename(columns={"sentence_number": "n"})

# merge() defaults to an inner join: rows without a match on `n` are dropped.
final_data = feature_matrix.merge(train_keyed, on=["n"])[output_columns]


In [None]:
# Show the joined feature/label table as the cell output for a visual check.
final_data

In [None]:
# Write the joined feature/label table to fd_10_feat.csv, leaving out the index column.
final_data.to_csv(path_or_buf="fd_10_feat.csv", index=False)