In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [2]:
# Obtain the SparkSession used by every cell below (getOrCreate returns an
# existing session when there is one instead of building a second one).
ss = SparkSession.builder.getOrCreate()

### DataFrame을 생성한다

In [3]:
# Build the two-row demo DataFrame. Only column names are supplied, so the
# column types are inferred from the Python values (see printSchema below).
df = ss.createDataFrame(
    data=[
        ("Jam", 5.62, "쥬스"),
        ("Ham", 0.85, "우유"),
    ],
    schema=["Name", "Amount", "Item"],
)

### DataFrame을 보여준다
```DataFrame.show()```

In [4]:
# Print the DataFrame's rows as a text table.
df.show()

+----+------+----+
|Name|Amount|Item|
+----+------+----+
| Jam|  5.62|쥬스|
| Ham|  0.85|우유|
+----+------+----+



### DataFrame의 스키마 정보를 보여준다
```DataFrame.printSchema()```

In [5]:
# Print each column's name, data type, and nullability.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Item: string (nullable = true)



### DataFrame의 요약 통계를 보여준다
```DataFrame.describe().show()```

In [6]:
# describe() builds a summary table (count, mean, stddev, min, max) for every
# column; mean and stddev are null for the string columns. show() prints it.
df.describe().show()

+-------+----+-----------------+----+
|summary|Name|           Amount|Item|
+-------+----+-----------------+----+
|  count|   2|                2|   2|
|   mean|null|            3.235|null|
| stddev|null|3.372899346259832|null|
|    min| Ham|             0.85|우유|
|    max| Jam|             5.62|쥬스|
+-------+----+-----------------+----+



### Column name을 바꾼다

```DataFrame.withColumnRenamed('old_name', 'new_name')```

In [7]:
# withColumnRenamed returns a new DataFrame; df keeps its original column names.
rename_df = df.withColumnRenamed(existing="Amount", new="Price")

In [8]:
# The Amount column now appears as Price; the row values are unchanged.
rename_df.show()

+----+-----+----+
|Name|Price|Item|
+----+-----+----+
| Jam| 5.62|쥬스|
| Ham| 0.85|우유|
+----+-----+----+



In [9]:
# import_ipynb is imported first so that the next line can load the sibling
# utils.ipynb notebook as a module (presumably via an import hook; see the
# "importing Jupyter notebook from utils.ipynb" message printed below).
# NOTE(review): imports are conventionally collected in the notebook's first cell.
import import_ipynb
from utils import rename

importing Jupyter notebook from utils.ipynb


In [10]:
# rename() comes from the local utils notebook. Called with one old name and
# one new name it produces the same table as withColumnRenamed did above.
rename_df = rename(df, 'Amount', 'Price')
rename_df.show()

+----+-----+----+
|Name|Price|Item|
+----+-----+----+
| Jam| 5.62|쥬스|
| Ham| 0.85|우유|
+----+-----+----+



In [11]:
# rename() also accepts two parallel lists (old names, then new names) and
# renames all listed columns in one call, as the output below shows.
rename_df = rename(df, ['Name', 'Amount', 'Item'], ['고객명', '가격', '상품'])
rename_df.show()

+------+----+----+
|고객명|가격|상품|
+------+----+----+
|   Jam|5.62|쥬스|
|   Ham|0.85|우유|
+------+----+----+



### Row 수를 확인한다

In [12]:
# Number of rows in df.
df.count()

2

### Column 수를 확인한다

In [13]:
# df.columns is a plain Python list of column names, so len() gives the column count.
len(df.columns)

3

### Column Type을 바꾼다

In [14]:
# Cast Amount from double to string. withColumn is given the existing column
# name, so the column is replaced (not added) in the returned DataFrame; df
# itself is left untouched.
df_change_type = df.withColumn('Amount', df['Amount'].cast(StringType()))

# Amount is now reported as string in the schema.
df_change_type.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Amount: string (nullable = true)
 |-- Item: string (nullable = true)



### Column을 삭제한다

In [15]:
# drop() returns a new DataFrame without the Name column; df itself is
# unchanged (df.columns further down still lists Name).
df.drop('Name').show()

+------+----+
|Amount|Item|
+------+----+
|  5.62|쥬스|
|  0.85|우유|
+------+----+



### .show() 와 .collect() 차이 
.show()는 결과를 화면에 출력만 할 뿐 값을 반환하지 않으므로, 그 결과를 다음 단계로 넘길 수 없다.

.collect()는 결과를 Row 객체의 list로 반환하므로, 다음 단계로 넘겨 분석을 이어갈 수 있다.

In [16]:
# Restrict the summary statistics to the Amount column and print them.
df.describe(['Amount']).show()

+-------+-----------------+
|summary|           Amount|
+-------+-----------------+
|  count|                2|
|   mean|            3.235|
| stddev|3.372899346259832|
|    min|             0.85|
|    max|             5.62|
+-------+-----------------+



In [17]:
# collect() hands the same summary back as a list of Row objects that later
# code can use. Note that every statistic is a string (e.g. Amount='3.235').
df.describe(['Amount']).collect()

[Row(summary='count', Amount='2'),
 Row(summary='mean', Amount='3.235'),
 Row(summary='stddev', Amount='3.372899346259832'),
 Row(summary='min', Amount='0.85'),
 Row(summary='max', Amount='5.62')]

### Column 명 확인

In [18]:
# Column names as a Python list of strings.
df.columns

['Name', 'Amount', 'Item']

### Schema 확인

In [19]:
# df still has its original types (Amount is double): the earlier cast was
# stored in a separate DataFrame and did not modify df.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Item: string (nullable = true)



### Aggregate Functions
- avg
- count
- max
- mean
- min
- sum

In [20]:
# Average of Amount, returned as a one-element list holding a single Row.
df.agg(F.avg('Amount')).collect()

[Row(avg(Amount)=3.235)]

In [22]:
# Count of Amount values.
df.agg(F.count('Amount')).collect()

[Row(count(Amount)=2)]

In [23]:
# Largest Amount value.
df.agg(F.max('Amount')).collect()

[Row(max(Amount)=5.62)]

In [24]:
# Smallest Amount value.
df.agg(F.min('Amount')).collect()

[Row(min(Amount)=0.85)]

In [28]:
# Sum of all Amount values.
df.agg(F.sum('Amount')).collect()

[Row(sum(Amount)=6.47)]

### Quantile(분위수) 계산

```df.approxQuantile('col', [probabilities], relativeError)```

- col: column name, or a list of names for multiple columns.
- probabilities: [0, 1] 사이 값. 0 is the minimum, 0.5 is the median, 1 is the maximum.
- relativeError: The relative target precision to achieve (>= 0). If set to zero, the exact quantiles are computed, which could be very expensive. Note that values greater than 1 are accepted but give the same result as 1.

In [29]:
# relativeError=0 requests the exact quantile. The result is an actual value
# taken from the column, not an interpolation: with the two rows 0.85 and 5.62
# the median comes back as 0.85 rather than their midpoint 3.235.
df.approxQuantile('Amount', [0.5], 0)

[0.85]

### pandas DataFrame으로 변경
- PySpark DataFrame을 pandas DataFrame으로 변환할 수 있다.

In [7]:
# Convert the Spark DataFrame into a pandas DataFrame.
# NOTE(review): per the PySpark docs, toPandas() loads every row into driver
# memory, so it is only appropriate for small results such as this 2-row frame.
pd_df = df.toPandas()

In [8]:
# Display the converted pandas DataFrame.
pd_df

Unnamed: 0,Name,Amount,Item
0,Jam,5.62,쥬스
1,Ham,0.85,우유


In [9]:
# Confirm that the converted object is a pandas DataFrame.
type(pd_df)

pandas.core.frame.DataFrame

In [10]:
# The original df is still a PySpark DataFrame; toPandas() did not change it.
type(df)

pyspark.sql.dataframe.DataFrame