In [1]:
import findspark
findspark.init()

from pyspark.sql import Row
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark import SparkFiles

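주의! SparkContext()는 최초 실행 후 다시 실행하면 에러가 발생한다.
> 🤔 왜 ? 하나의 JVM(커널)에는 활성화된 SparkContext가 하나만 존재할 수 있기 때문이다. 셀을 다시 실행해야 한다면 `sc.stop()`으로 기존 컨텍스트를 종료한 뒤 새로 만들거나, `SparkContext.getOrCreate()`로 기존 컨텍스트를 재사용한다.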

In [2]:
# Reuse the already-running SparkContext when this cell is executed again;
# a second bare SparkContext() call raises because only one context may be
# active at a time.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [4]:
# Register the remote CSV with Spark so it can be located through SparkFiles.
url = "https://raw.githubusercontent.com/guru99-edu/R-Programming/master/adult_data.csv"
sc.addFile(url)
# `sqlContext` was already built from `sc` in the context-creation cell, so it
# is not re-created here.

In [5]:
# Read the downloaded CSV: the first row is the header and column types are
# inferred from the data.
df = (
    sqlContext.read
    .options(header=True, inferSchema=True)
    .csv(SparkFiles.get("adult_data.csv"))
)

In [6]:
# Print the schema as a tree: one line per column with its type and nullability.
df.printSchema()

root
 |-- x: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



### PySpark에서 사용할 수 있는 대표적인 함수를 알아보자

Aggregate on the entire DataFrame without groups
- avg
- count
- max
- mean
- min
- sum

In [7]:
from pyspark.sql import functions as F

# Mean of `age` over the whole DataFrame (no grouping).
df.agg(F.avg(F.col('age'))).collect()

[Row(avg(age)=38.64358543876172)]

In [62]:
# Count of `age` values over the whole DataFrame, using the dict form of agg().
df.agg({'age': 'count'}).collect()

[Row(count(age)=48842)]

In [63]:
# Maximum `age`; an empty groupBy() aggregates over the entire DataFrame.
df.groupBy().max('age').collect()

[Row(max(age)=90)]

In [64]:
# Minimum `age`; an empty groupBy() aggregates over the entire DataFrame.
df.groupBy().min('age').collect()

[Row(min(age)=17)]

In [65]:
# Sum of `age` over the whole DataFrame, using the dict form of agg().
df.agg({'age': 'sum'}).collect()

[Row(sum(age)=1887430)]

Quantile(분위수) 계산

```df.approxQuantile('col', [probabilities], relativeError)```

- col: column name, or a list of names for multiple columns.
- probabilities: [0, 1] 사이 값. 0 is the minimum, 0.5 is the median, 1 is the maximum.
- relativeError: The relative target precision to achieve (>= 0). If set to zero, the exact quantiles are computed, which could be very expensive. Note that values greater than 1 are accepted but give the same result as 1.

In [138]:
# Median of `age`; relativeError=0 asks for the exact quantile.
df.approxQuantile(col='age', probabilities=[0.5], relativeError=0.0)

[37.0]

In [46]:
# First quartile of several columns at once; one inner list is returned per column.
df.approxQuantile(col=['age', 'x'], probabilities=[0.25], relativeError=0.0)

[[28.0], [12211.0]]

SQL ``` SELECT * FROM [TABLE] LIMIT 5```

In [52]:
# LIMIT 5, then bring those rows to the driver as a list of Row objects.
df.limit(5).collect()

[Row(x=1, age=25, workclass='Private', fnlwgt=226802, education='11th', educational-num=7, marital-status='Never-married', occupation='Machine-op-inspct', relationship='Own-child', race='Black', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country='United-States', income='<=50K'),
 Row(x=2, age=38, workclass='Private', fnlwgt=89814, education='HS-grad', educational-num=9, marital-status='Married-civ-spouse', occupation='Farming-fishing', relationship='Husband', race='White', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=50, native-country='United-States', income='<=50K'),
 Row(x=3, age=28, workclass='Local-gov', fnlwgt=336951, education='Assoc-acdm', educational-num=12, marital-status='Married-civ-spouse', occupation='Protective-serv', relationship='Husband', race='White', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country='United-States', income='>50K'),
 Row(x=4, age=44, workclass='Private', fnlwgt=160323, educ

Column 명을 보고 싶어

In [53]:
# List of all column names, in schema order.
df.columns

['x',
 'age',
 'workclass',
 'fnlwgt',
 'education',
 'educational-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country',
 'income']

### .show() 와 .collect() 차이 
.show()는 결과를 화면에 출력만 하고 `None`을 반환한다. 따라서 그 결과를 다음 단계로 넘길 수 없다.

.collect() 는 list로 값을 넘겨준다. 다음 단계로 넘겨 분석을 진행할 수 있다.

In [87]:
# Summary statistics for `fnlwgt`, printed as a text table.
df.describe('fnlwgt').show()

+-------+------------------+
|summary|            fnlwgt|
+-------+------------------+
|  count|             48842|
|   mean|189664.13459727284|
| stddev|105604.02542315757|
|    min|             12285|
|    max|           1490400|
+-------+------------------+



In [88]:
# Same summary statistics, returned as a list of Row objects for further use.
df.describe('fnlwgt').collect()

[Row(summary='count', fnlwgt='48842'),
 Row(summary='mean', fnlwgt='189664.13459727284'),
 Row(summary='stddev', fnlwgt='105604.02542315757'),
 Row(summary='min', fnlwgt='12285'),
 Row(summary='max', fnlwgt='1490400')]

where 조건은 .filter()

In [115]:
# WHERE age > 70. first() fetches only the first matching row instead of
# collecting every match to the driver just to index element 0.
df.filter('age > 70').first()

Row(x=23, age=72, workclass='?', fnlwgt=132015, education='7th-8th', educational-num=4, marital-status='Divorced', occupation='?', relationship='Not-in-family', race='White', gender='Female', capital-gain=0, capital-loss=0, hours-per-week=6, native-country='United-States', income='<=50K')

and 조건 / or 조건

In [135]:
# AND condition. first() fetches only the first matching row instead of
# collecting every match to the driver just to index element 0.
df.filter('age > 70 and age < 80').first()

Row(x=23, age=72, workclass='?', fnlwgt=132015, education='7th-8th', educational-num=4, marital-status='Divorced', occupation='?', relationship='Not-in-family', race='White', gender='Female', capital-gain=0, capital-loss=0, hours-per-week=6, native-country='United-States', income='<=50K')

In [134]:
# OR condition. take(2) fetches just the two leading matches so the second
# one can be shown without collecting the whole filtered result.
df.filter('age < 20 or age > 80').take(2)[1]

Row(x=39, age=17, workclass='Private', fnlwgt=269430, education='10th', educational-num=6, marital-status='Never-married', occupation='Machine-op-inspct', relationship='Not-in-family', race='White', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country='United-States', income='<=50K')

GROUP BY는 .groupBy().

In [77]:
# Average age per gender, using the dict form of agg().
df.groupBy('gender').agg({'age': 'avg'}).collect()

[Row(gender='Female', avg(age)=36.92798913043478),
 Row(gender='Male', avg(age)=39.49439509954058)]

groupBy 기준 여러개를 설정할 수 있다.

In [108]:
# Average age per (gender, race) pair; the grouping keys are passed as one list.
df.groupBy(['gender', 'race']).avg('age').collect()

[Row(gender='Male', race='White', avg(age)=39.70450669914738),
 Row(gender='Female', race='Asian-Pac-Islander', avg(age)=35.657640232108314),
 Row(gender='Female', race='White', avg(age)=36.88293544177478),
 Row(gender='Female', race='Amer-Indian-Eskimo', avg(age)=36.23783783783784),
 Row(gender='Male', race='Other', avg(age)=35.167330677290835),
 Row(gender='Male', race='Black', avg(age)=37.922591501893145),
 Row(gender='Male', race='Asian-Pac-Islander', avg(age)=38.9940119760479),
 Row(gender='Male', race='Amer-Indian-Eskimo', avg(age)=36.98947368421052),
 Row(gender='Female', race='Other', avg(age)=31.212903225806453),
 Row(gender='Female', race='Black', avg(age)=37.90597920277296)]

물론 구하려는 변수에도 여러개 입력 가능

In [113]:
# Several averages per gender in one pass, written as explicit agg() expressions.
df.groupBy('gender').agg(F.avg('age'), F.avg('fnlwgt')).show()

+------+-----------------+------------------+
|gender|         avg(age)|       avg(fnlwgt)|
+------+-----------------+------------------+
|Female|36.92798913043478|185504.47171442688|
|  Male|39.49439509954058| 191727.0216232772|
+------+-----------------+------------------+



ORDER BY는 .sort()로 할 수 있다. 

desc()로 감싸면 descending 가능

In [104]:
# ORDER BY gender DESC, race ASC. `desc` is never imported as a bare name in
# this notebook, so it is referenced through the `F` functions alias instead.
df.groupBy('gender', 'race').avg('age').sort(F.desc('gender'), 'race').collect()

[Row(gender='Male', race='White', avg(age)=39.70450669914738),
 Row(gender='Male', race='Other', avg(age)=35.167330677290835),
 Row(gender='Male', race='Black', avg(age)=37.922591501893145),
 Row(gender='Male', race='Asian-Pac-Islander', avg(age)=38.9940119760479),
 Row(gender='Male', race='Amer-Indian-Eskimo', avg(age)=36.98947368421052),
 Row(gender='Female', race='White', avg(age)=36.88293544177478),
 Row(gender='Female', race='Other', avg(age)=31.212903225806453),
 Row(gender='Female', race='Black', avg(age)=37.90597920277296),
 Row(gender='Female', race='Asian-Pac-Islander', avg(age)=35.657640232108314),
 Row(gender='Female', race='Amer-Indian-Eskimo', avg(age)=36.23783783783784)]

상위 n개 

In [96]:
# First 5 rows, returned as a list of Row objects.
df.head(5)

[Row(x=1, age=25, workclass='Private', fnlwgt=226802, education='11th', educational-num=7, marital-status='Never-married', occupation='Machine-op-inspct', relationship='Own-child', race='Black', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country='United-States', income='<=50K'),
 Row(x=2, age=38, workclass='Private', fnlwgt=89814, education='HS-grad', educational-num=9, marital-status='Married-civ-spouse', occupation='Farming-fishing', relationship='Husband', race='White', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=50, native-country='United-States', income='<=50K'),
 Row(x=3, age=28, workclass='Local-gov', fnlwgt=336951, education='Assoc-acdm', educational-num=12, marital-status='Married-civ-spouse', occupation='Protective-serv', relationship='Husband', race='White', gender='Male', capital-gain=0, capital-loss=0, hours-per-week=40, native-country='United-States', income='>50K'),
 Row(x=4, age=44, workclass='Private', fnlwgt=160323, educ