In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("Carpet and Hardwood").config("spark.some.config.option", "some-value").getOrCreate()
carpet = spark.read.csv("../data/carpet.csv")
hardwood = spark.read.csv("../data/hardwood.csv")

**Statistical Information**

In [3]:
n_carpet = len(carpet.columns) * carpet.count()
print("Number of observations in carpet.csv:", n_carpet)
n_hardwood = len(hardwood.columns) * hardwood.count()
print("Number of observations in hardwood.csv:", n_hardwood)

Number of observations in carpet.csv: 65536
Number of observations in hardwood.csv: 65536


In [4]:
for col in carpet.columns:
    carpet.describe([col]).show()

+-------+------------------+
|summary|               _c0|
+-------+------------------+
|  count|              1024|
|   mean| 151.8504160156249|
| stddev|22.671127881603628|
|    min|            100.21|
|    max|            99.996|
+-------+------------------+

+-------+------------------+
|summary|               _c1|
+-------+------------------+
|  count|              1024|
|   mean|151.28129980468736|
| stddev|22.043465914193835|
|    min|            100.01|
|    max|            99.782|
+-------+------------------+

+-------+------------------+
|summary|               _c2|
+-------+------------------+
|  count|              1024|
|   mean|151.28350976562498|
| stddev| 21.64234771849469|
|    min|            100.21|
|    max|            99.004|
+-------+------------------+

+-------+------------------+
|summary|               _c3|
+-------+------------------+
|  count|              1024|
|   mean|151.91973242187484|
| stddev|21.715600769419606|
|    min|            101.02|
|    max|  

+-------+------------------+
|summary|              _c32|
+-------+------------------+
|  count|              1024|
|   mean|151.04717675781248|
| stddev|22.869231645204852|
|    min|            100.41|
|    max|            99.244|
+-------+------------------+

+-------+------------------+
|summary|              _c33|
+-------+------------------+
|  count|              1024|
|   mean|152.25547167968753|
| stddev|22.575592434496684|
|    min|            100.81|
|    max|            99.453|
+-------+------------------+

+-------+------------------+
|summary|              _c34|
+-------+------------------+
|  count|              1024|
|   mean|151.30748242187508|
| stddev|22.787499393991578|
|    min|            101.29|
|    max|            96.636|
+-------+------------------+

+-------+------------------+
|summary|              _c35|
+-------+------------------+
|  count|              1024|
|   mean|150.19197167968738|
| stddev|21.850658237895374|
|    min|            100.28|
|    max|  

In [9]:
carpet_df = carpet.toPandas()
hardwood_df = hardwood.toPandas()
carpet_sum_df = carpet.describe().toPandas().set_index('summary')
hardwood_sum_df = hardwood.describe().toPandas().set_index('summary')

In [10]:
import matplotlib.pyplot as plt

In [11]:
carpet_df

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,...,_c54,_c55,_c56,_c57,_c58,_c59,_c60,_c61,_c62,_c63
0,170.39,167.28,143.44,124.67,139.01,125.83,144.33,151.26,175.51,171.31,...,172.96,169.67,157.51,161.06,133.23,124.41,138.44,142.93,137.13,134.44
1,169.75,190.96,175.53,138.27,137.47,139.23,133.23,130.25,147.73,163.93,...,139.58,141.58,153.39,141,148.43,168.12,169.9,165.64,166.86,137.69
2,153.69,153.68,144.02,158.73,178.87,157.04,152.92,147.52,142.87,165.26,...,155.19,170.51,155.37,167.11,146.89,141.01,159.43,169.68,163.24,165.17
3,131.69,151.56,151.05,134,151.18,175.53,171.34,159.77,151.95,146.1,...,164.25,155.82,157.83,152.43,150.82,146.58,128.85,140.76,177.35,174.61
4,162.85,158.88,132.27,138.41,143.98,159.3,177.26,180.58,159.34,164.66,...,132.8,130.96,135.74,167.31,188.21,179.52,146.2,153.73,152.12,146.58
5,132.05,149.12,165.08,170.62,162.19,157.1,145.86,149.52,162.84,149.5,...,147.36,140.65,142.59,137.61,157.63,152.16,140.43,142.32,142.06,154.87
6,153.59,142.25,157.33,156.08,149.33,162.97,150.25,146.47,145.99,137.82,...,159.71,149.75,174.9,154.72,162.83,162.37,162.75,168.6,170.81,168.1
7,167.68,153.49,149.19,148.71,166.03,167.04,153.06,157.48,133.57,143.66,...,150.72,154.01,145.76,147.58,145.01,164.87,157.2,147.07,162.98,167.99
8,136.48,130.02,131.72,152.04,163.03,172.93,170.11,165.2,166.41,120.67,...,145.14,162,142.04,121.19,124.21,145.11,142.15,148.38,142.86,154.15
9,145.96,140.31,126.34,113.12,118.66,140.33,139.9,139.51,168.7,149.54,...,171.77,152.2,121.89,125.42,146.03,142.27,127.53,141.77,160.67,159.55


In [12]:
carpet_sum_df

Unnamed: 0_level_0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,...,_c54,_c55,_c56,_c57,_c58,_c59,_c60,_c61,_c62,_c63
summary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
count,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,...,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0,1024.0
mean,151.8504160156249,151.28129980468736,151.28350976562498,151.91973242187484,152.5990107421876,152.82919921874984,152.29369042968744,151.9605263671876,152.1237177734376,152.248083984375,...,152.4150361328128,152.17765234375003,151.54137695312497,150.68695800781245,150.89364746093747,151.4153955078125,151.73828613281248,152.11242285156277,152.0208759765624,152.20948925781252
stddev,22.671127881603628,22.043465914193835,21.64234771849469,21.71560076941961,22.46717973237768,22.33678920082103,22.02894909046214,22.66004905036816,22.858321647979697,22.513211154249717,...,22.928062101193632,22.64336874792229,22.319997056379076,22.579196776340545,22.69918024425609,23.34175219782217,22.73706026105661,22.50129676658248,22.58383897718583,22.56167295762885
min,100.21,100.01,100.21,101.02,100.37,100.37,100.22,100.12,101.58,100.0,...,101.62,100.34,100.04,100.18,100.35,100.42,101.18,100.31,100.9,101.87
max,99.996,99.782,99.004,99.792,99.81,97.818,99.909,99.158,99.731,99.406,...,99.633,99.932,99.781,99.489,99.038,99.645,99.852,98.518,99.208,99.17
