In [1]:
!pip install pyspark==3.5.3

Collecting pyspark==3.5.3
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840629 sha256=7692b1ba79d581f1ff28b91b1035e59dd78f226d59d2478ecf936670b35f4861
  Stored in directory: /root/.cache/pip/wheels/07/a0/a3/d24c94bf043ab5c7e38c30491199a2a11fef8d2584e6df7fb7
Successfully built pyspark
Installing collected packages: pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.1
    Uninstalling pyspark-3.5.1:
      Successfully uninstalled pyspark-3.5.1
Successfully installed pyspark-3.5.3


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.jars", "/usr/local/lib/python3.7/dist-packages/pyspark/jars/RedshiftJDBC42-no-awssdk-1.2.20.1043.jar") \
    .getOrCreate()

In [3]:
!wget https://s3-geospatial.s3-us-west-2.amazonaws.com/survey_results_public.csv

--2025-11-21 02:47:53--  https://s3-geospatial.s3-us-west-2.amazonaws.com/survey_results_public.csv
Resolving s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)... 3.5.85.161, 52.218.250.65, 3.5.85.241, ...
Connecting to s3-geospatial.s3-us-west-2.amazonaws.com (s3-geospatial.s3-us-west-2.amazonaws.com)|3.5.85.161|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81101949 (77M) [text/csv]
Saving to: ‘survey_results_public.csv’


2025-11-21 02:47:56 (34.3 MB/s) - ‘survey_results_public.csv’ saved [81101949/81101949]



In [4]:
!ls -tl

total 79212
drwxr-xr-x 1 root root     4096 Nov 17 14:29 sample_data
-rw-r--r-- 1 root root 81101949 Jan 15  2023 survey_results_public.csv


In [5]:
!head -5 survey_results_public.csv

ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,YearsCodePro,DevType,OrgSize,Currency,CompTotal,CompFreq,LanguageHaveWorkedWith,LanguageWantToWorkWith,DatabaseHaveWorkedWith,DatabaseWantToWorkWith,PlatformHaveWorkedWith,PlatformWantToWorkWith,WebframeHaveWorkedWith,WebframeWantToWorkWith,MiscTechHaveWorkedWith,MiscTechWantToWorkWith,ToolsTechHaveWorkedWith,ToolsTechWantToWorkWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsWantToWorkWith,OpSys,NEWStuck,NEWSOSites,SOVisitFreq,SOAccount,SOPartFreq,SOComm,NEWOtherComms,Age,Gender,Trans,Sexuality,Ethnicity,Accessibility,MentalHealth,SurveyLength,SurveyEase,ConvertedCompYearly
1,I am a developer by profession,"Independent contractor, freelancer, or self-employed",Slovakia,NA,NA,"Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)",18 - 24 years,"Coding Bootcamp;Other online resources (ex: videos, blogs, etc)",NA,NA,"Developer, mobile",20 to 99 employees,EUR Eu

In [6]:
df = spark.read.csv("survey_results_public.csv", header=True).select('ResponseId', 'LanguageHaveWorkedWith', 'LanguageWantToWorkWith')

In [7]:
df.printSchema()

root
 |-- ResponseId: string (nullable = true)
 |-- LanguageHaveWorkedWith: string (nullable = true)
 |-- LanguageWantToWorkWith: string (nullable = true)



In [8]:
import pyspark.sql.functions as F

# LanguageHaveWorkedWith 값을 트림하고 ;를 가지고 나눠서 리스트의 형태로 language_have 필드로 설정
df2 = df.withColumn(
    "language_have",
    F.split(F.trim(F.col("LanguageHaveWorkedWith")), ";")
)

In [9]:
df2.show(5)

+----------+----------------------+----------------------+--------------------+
|ResponseId|LanguageHaveWorkedWith|LanguageWantToWorkWith|       language_have|
+----------+----------------------+----------------------+--------------------+
|         1|  C++;HTML/CSS;Java...|                 Swift|[C++, HTML/CSS, J...|
|         2|     JavaScript;Python|                    NA|[JavaScript, Python]|
|         3|  Assembly;C;Python...|     Julia;Python;Rust|[Assembly, C, Pyt...|
|         4|  JavaScript;TypeSc...|  JavaScript;TypeSc...|[JavaScript, Type...|
|         5|  Bash/Shell;HTML/C...|  Bash/Shell;HTML/C...|[Bash/Shell, HTML...|
+----------+----------------------+----------------------+--------------------+
only showing top 5 rows



In [10]:
# LanguageWantToWorkWith 값을 트림하고 ;를 가지고 나눠서 리스트의 형태로 language_want 필드로 설정
df3 = df2.withColumn(
    "language_want",
    F.split(F.trim(F.col("LanguageWantToWorkWith")), ";")
)

In [11]:
df3.printSchema()

root
 |-- ResponseId: string (nullable = true)
 |-- LanguageHaveWorkedWith: string (nullable = true)
 |-- LanguageWantToWorkWith: string (nullable = true)
 |-- language_have: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- language_want: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [12]:
df3.show(5)

+----------+----------------------+----------------------+--------------------+--------------------+
|ResponseId|LanguageHaveWorkedWith|LanguageWantToWorkWith|       language_have|       language_want|
+----------+----------------------+----------------------+--------------------+--------------------+
|         1|  C++;HTML/CSS;Java...|                 Swift|[C++, HTML/CSS, J...|             [Swift]|
|         2|     JavaScript;Python|                    NA|[JavaScript, Python]|                [NA]|
|         3|  Assembly;C;Python...|     Julia;Python;Rust|[Assembly, C, Pyt...|[Julia, Python, R...|
|         4|  JavaScript;TypeSc...|  JavaScript;TypeSc...|[JavaScript, Type...|[JavaScript, Type...|
|         5|  Bash/Shell;HTML/C...|  Bash/Shell;HTML/C...|[Bash/Shell, HTML...|[Bash/Shell, HTML...|
+----------+----------------------+----------------------+--------------------+--------------------+
only showing top 5 rows



## 현재 많이 사용되는 언어들 찾기

In [13]:
df_language_have = df3.select(
    df3.ResponseId,
    F.explode(df3.language_have).alias("language_have")
)

In [14]:
df_language_have.show(10)

+----------+-------------+
|ResponseId|language_have|
+----------+-------------+
|         1|          C++|
|         1|     HTML/CSS|
|         1|   JavaScript|
|         1|  Objective-C|
|         1|          PHP|
|         1|        Swift|
|         2|   JavaScript|
|         2|       Python|
|         3|     Assembly|
|         3|            C|
+----------+-------------+
only showing top 10 rows



In [15]:
df_language_have.groupby("language_have").count().show(10)

+-------------+-----+
|language_have|count|
+-------------+-----+
|           C#|22984|
|          VBA| 3847|
|         Rust| 5799|
|   Bash/Shell|22385|
|   JavaScript|53587|
|           NA| 1082|
|         Perl| 2028|
|       Erlang|  651|
|       Matlab| 3846|
|      Crystal|  466|
+-------------+-----+
only showing top 10 rows



Sorting 두 가지 방법:

*   sort & orderBy
*   ascending & descending

In [16]:
df_language_have.groupby("language_have").count().sort(F.desc("count")).collect()

[Row(language_have='JavaScript', count=53587),
 Row(language_have='HTML/CSS', count=46259),
 Row(language_have='Python', count=39792),
 Row(language_have='SQL', count=38835),
 Row(language_have='Java', count=29162),
 Row(language_have='Node.js', count=27975),
 Row(language_have='TypeScript', count=24909),
 Row(language_have='C#', count=22984),
 Row(language_have='Bash/Shell', count=22385),
 Row(language_have='C++', count=20057),
 Row(language_have='PHP', count=18130),
 Row(language_have='C', count=17329),
 Row(language_have='PowerShell', count=8871),
 Row(language_have='Go', count=7879),
 Row(language_have='Kotlin', count=6866),
 Row(language_have='Rust', count=5799),
 Row(language_have='Ruby', count=5569),
 Row(language_have='Dart', count=4965),
 Row(language_have='Assembly', count=4632),
 Row(language_have='Swift', count=4204),
 Row(language_have='R', count=4185),
 Row(language_have='VBA', count=3847),
 Row(language_have='Matlab', count=3846),
 Row(language_have='Groovy', count=2479)

In [17]:
df_language_have.groupby("language_have").count().orderBy('count', ascending=False).collect()

[Row(language_have='JavaScript', count=53587),
 Row(language_have='HTML/CSS', count=46259),
 Row(language_have='Python', count=39792),
 Row(language_have='SQL', count=38835),
 Row(language_have='Java', count=29162),
 Row(language_have='Node.js', count=27975),
 Row(language_have='TypeScript', count=24909),
 Row(language_have='C#', count=22984),
 Row(language_have='Bash/Shell', count=22385),
 Row(language_have='C++', count=20057),
 Row(language_have='PHP', count=18130),
 Row(language_have='C', count=17329),
 Row(language_have='PowerShell', count=8871),
 Row(language_have='Go', count=7879),
 Row(language_have='Kotlin', count=6866),
 Row(language_have='Rust', count=5799),
 Row(language_have='Ruby', count=5569),
 Row(language_have='Dart', count=4965),
 Row(language_have='Assembly', count=4632),
 Row(language_have='Swift', count=4204),
 Row(language_have='R', count=4185),
 Row(language_have='VBA', count=3847),
 Row(language_have='Matlab', count=3846),
 Row(language_have='Groovy', count=2479)

In [18]:
df_language50_have = df_language_have.groupby("language_have")\
    .count()\
    .orderBy('count', ascending=False)\
    .limit(50)

In [19]:
df_language50_have.write.mode('overwrite').csv("language50_have")

In [20]:
!ls -tl

total 79216
drwxr-xr-x 2 root root     4096 Nov 21 02:54 language50_have
drwxr-xr-x 1 root root     4096 Nov 17 14:29 sample_data
-rw-r--r-- 1 root root 81101949 Jan 15  2023 survey_results_public.csv


In [21]:
!ls -tl language50_have/

total 4
-rw-r--r-- 1 root root   0 Nov 21 02:54 _SUCCESS
-rw-r--r-- 1 root root 447 Nov 21 02:54 part-00000-3f4b6928-117e-4ea6-87e5-9ccffb983abe-c000.csv


In [22]:
!cat language50_have/part-00000-3f4b6928-117e-4ea6-87e5-9ccffb983abe-c000.csv


JavaScript,53587
HTML/CSS,46259
Python,39792
SQL,38835
Java,29162
Node.js,27975
TypeScript,24909
C#,22984
Bash/Shell,22385
C++,20057
PHP,18130
C,17329
PowerShell,8871
Go,7879
Kotlin,6866
Rust,5799
Ruby,5569
Dart,4965
Assembly,4632
Swift,4204
R,4185
VBA,3847
Matlab,3846
Groovy,2479
Objective-C,2310
Scala,2148
Perl,2028
Haskell,1749
Delphi,1731
Clojure,1552
Elixir,1438
LISP,1096
NA,1082
Julia,1068
F#,804
Erlang,651
APL,536
Crystal,466
COBOL,437


## 가장 배우고 싶은 언어들 찾기

In [23]:
df_language_want = df3.select(
    df3.ResponseId,
    F.explode(df3.language_want).alias("language_want")
)

In [24]:
df_language_want.show(5)

+----------+-------------+
|ResponseId|language_want|
+----------+-------------+
|         1|        Swift|
|         2|           NA|
|         3|        Julia|
|         3|       Python|
|         3|         Rust|
+----------+-------------+
only showing top 5 rows



In [25]:
df_language_want.groupby("language_want").count().show(10)

+-------------+-----+
|language_want|count|
+-------------+-----+
|           C#|17999|
|          VBA| 1069|
|         Rust|15865|
|   Bash/Shell|14043|
|   JavaScript|37008|
|           NA| 6618|
|         Perl| 1175|
|       Erlang| 1379|
|       Matlab| 1562|
|      Crystal|  790|
+-------------+-----+
only showing top 10 rows



In [26]:
df_language50_want = df_language_want.groupby("language_want").count().orderBy('count', ascending=False).limit(50)

In [27]:
df_language50_want.show(10)

+-------------+-----+
|language_want|count|
+-------------+-----+
|   JavaScript|37008|
|       Python|34929|
|     HTML/CSS|29353|
|   TypeScript|26905|
|          SQL|26631|
|      Node.js|24100|
|           C#|17999|
|         Java|17222|
|         Rust|15865|
|           Go|15788|
+-------------+-----+
only showing top 10 rows



In [28]:
df_language50_want.write.mode('overwrite').csv("language50_want")

In [29]:
!ls -tl language50_want/

total 4
-rw-r--r-- 1 root root   0 Nov 21 02:55 _SUCCESS
-rw-r--r-- 1 root root 449 Nov 21 02:55 part-00000-7ef499da-7335-4837-812c-6ac3306eb596-c000.csv


In [30]:
!cat language50_want/part-00000-7ef499da-7335-4837-812c-6ac3306eb596-c000.csv

JavaScript,37008
Python,34929
HTML/CSS,29353
TypeScript,26905
SQL,26631
Node.js,24100
C#,17999
Java,17222
Rust,15865
Go,15788
C++,15249
Bash/Shell,14043
Kotlin,10691
C,9702
PHP,8852
Dart,7018
NA,6618
Swift,6353
Ruby,4942
PowerShell,4896
R,4015
Assembly,3578
Haskell,3453
Elixir,3374
Scala,3165
Julia,2445
Clojure,2413
F#,2157
Matlab,1562
LISP,1513
Objective-C,1400
Erlang,1379
Groovy,1177
Perl,1175
VBA,1069
Delphi,975
Crystal,790
APL,568
COBOL,309
