# Joins in `pyspark`

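Joins are performed with `df_left.join(df_right, on=join_condition, how=type_str)`, where `type_str` is one of `'inner'`, `'left'`, `'right'` or `'outer'`.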

In [1]:
from pyspark.sql import SparkSession
from more_pyspark import to_pandas
# Start (or reuse) the Spark session that every later cell reads through.
spark = (SparkSession
         .builder
         .appName('Ops')
         .getOrCreate())

# Department table: the header row names the columns and the column types
# are inferred from the file contents.
department_csv = "./data/department.csv"
deptk = spark.read.csv(department_csv, header=True, inferSchema=True)
deptk.collect() >> to_pandas

Unnamed: 0,DeptID,DeptName
0,31,Sales
1,33,Engineering
2,34,Clerical
3,35,Marketing


In [2]:
# Employee table, loaded the same way as the department table.
# Note in the output that one employee (Williams) has no DeptID.
employee_csv = "./data/employee.csv"
emplk = spark.read.csv(employee_csv, header=True, inferSchema=True)
emplk.collect() >> to_pandas

Unnamed: 0,LastName,DeptID
0,Rafferty,31.0
1,Jones,33.0
2,Heisenberg,33.0
3,Robinson,34.0
4,Smith,34.0
5,Williams,


#### Inner join

In [3]:
# Inner join: only rows whose DeptID appears in both tables survive, so the
# employee without a department and the department without employees drop out.
dept_match = emplk.DeptID == deptk.DeptID
emplk.join(deptk, dept_match, how='inner').collect() >> to_pandas

Unnamed: 0,LastName,DeptID,DeptName
0,Rafferty,31,Sales
1,Jones,33,Engineering
2,Heisenberg,33,Engineering
3,Robinson,34,Clerical
4,Smith,34,Clerical


#### Left join

In [4]:
# Left join: every employee row is kept; an employee with no matching
# department gets empty department columns.
dept_match = emplk.DeptID == deptk.DeptID
emplk.join(deptk, dept_match, how='left').collect() >> to_pandas

Unnamed: 0,LastName,DeptID,DeptName
0,Rafferty,31.0,Sales
1,Jones,33.0,Engineering
2,Heisenberg,33.0,Engineering
3,Robinson,34.0,Clerical
4,Smith,34.0,Clerical
5,Williams,,


#### Right join

In [5]:
# Right join: every department row is kept; a department with no employees
# appears once with an empty LastName.
dept_match = emplk.DeptID == deptk.DeptID
emplk.join(deptk, dept_match, how='right').collect() >> to_pandas

Unnamed: 0,LastName,DeptID,DeptName
0,Rafferty,31,Sales
1,Heisenberg,33,Engineering
2,Jones,33,Engineering
3,Smith,34,Clerical
4,Robinson,34,Clerical
5,,35,Marketing


#### Outer join

In [6]:
# Outer join: rows from both sides are kept, matched where possible and
# padded with empty values where there is no partner row.
dept_match = emplk.DeptID == deptk.DeptID
emplk.join(deptk, dept_match, how='outer').collect() >> to_pandas

Unnamed: 0,LastName,DeptID,DeptName
0,Rafferty,31.0,Sales
1,Robinson,34.0,Clerical
2,Smith,34.0,Clerical
3,Williams,,
4,,35.0,Marketing
5,Jones,33.0,Engineering
6,Heisenberg,33.0,Engineering


## <font color="red"> Exercise 2 </font>

Determine all the players that have hit more than 100 home runs in a season.  The final table should include each player's proper name, as well as the team name.  

**Hint:** You will need to join the files listed below.  To get credit for this exercise, use the `pyspark` join methods presented above.

In [7]:
from pyspark.sql.functions import *
# CSV files for the exercise, listed in the same order as the names they are
# unpacked into below (batting, people, teams).
paths = (
    "./data/baseball/core/Batting.csv",
    "./data/baseball/core/People.csv",
    "./data/baseball/core/Teams.csv",
)

# Read each file as a Spark DataFrame with a header row and inferred types.
batting, people, teams = (
    spark.read.csv(csv_path, header=True, inferSchema=True)
    for csv_path in paths
)

In [8]:
# Preview the raw batting table. `collect()` returns every row to the driver
# (the frame shown below runs to index 107427) before piping into `to_pandas`.
# NOTE(review): consider `.limit(n)` before `collect()` if only a preview is needed.
batting.collect() >> to_pandas

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,0.0
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,0.0
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,19.0,3.0,1.0,2,5.0,,,,,1.0
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,27.0,1.0,1.0,0,2.0,,,,,0.0
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,16.0,6.0,2.0,2,1.0,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107424,zimmejo02,2019,1,DET,AL,23,2,0,0,0,...,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0
107425,zimmeky01,2019,1,KCA,AL,15,0,0,0,0,...,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
107426,zimmery01,2019,1,WAS,NL,52,171,20,44,9,...,27.0,0.0,0.0,17,39.0,0.0,0.0,0.0,2.0,4.0
107427,zobribe01,2019,1,CHN,NL,47,150,24,39,5,...,17.0,0.0,0.0,23,24.0,0.0,1.0,0.0,2.0,6.0


In [11]:
# Keep only the columns that identify a player, the team and the season.
# The result is collected and piped through `to_pandas`, so `batting_players`
# is a local table rather than a Spark DataFrame.
batting_players = batting.select('playerID', 'teamID', 'yearID').collect() >> to_pandas

batting_players.head()

Unnamed: 0,playerID,teamID,yearID
0,abercda01,TRO,1871
1,addybo01,RC1,1871
2,allisar01,CL1,1871
3,allisdo01,WS3,1871
4,ansonca01,RC1,1871


In [13]:
# Preview the raw people table (one row per playerID, with name and biographical
# columns). `collect()` returns every row to the driver before display.
people.collect() >> to_pandas

Unnamed: 0,playerID,birthYear,birthMonth,birthDay,birthCountry,birthState,birthCity,deathYear,deathMonth,deathDay,...,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,aardsda01,1981.0,12.0,27.0,USA,CO,Denver,,,,...,Aardsma,David Allan,215.0,75.0,R,R,2004-04-06,2015-08-23,aardd001,aardsda01
1,aaronha01,1934.0,2.0,5.0,USA,AL,Mobile,,,,...,Aaron,Henry Louis,180.0,72.0,R,R,1954-04-13,1976-10-03,aaroh101,aaronha01
2,aaronto01,1939.0,8.0,5.0,USA,AL,Mobile,1984.0,8.0,16.0,...,Aaron,Tommie Lee,190.0,75.0,R,R,1962-04-10,1971-09-26,aarot101,aaronto01
3,aasedo01,1954.0,9.0,8.0,USA,CA,Orange,,,,...,Aase,Donald William,190.0,75.0,R,R,1977-07-26,1990-10-03,aased001,aasedo01
4,abadan01,1972.0,8.0,25.0,USA,FL,Palm Beach,,,,...,Abad,Fausto Andres,184.0,73.0,L,L,2001-09-10,2006-04-13,abada001,abadan01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20085,zupofr01,1939.0,8.0,29.0,USA,CA,San Francisco,2005.0,3.0,25.0,...,Zupo,Frank Joseph,182.0,71.0,L,R,1957-07-01,1961-05-09,zupof101,zupofr01
20086,zuvelpa01,1958.0,10.0,31.0,USA,CA,San Mateo,,,,...,Zuvella,Paul,173.0,72.0,R,R,1982-09-04,1991-05-02,zuvep001,zuvelpa01
20087,zuverge01,1924.0,8.0,20.0,USA,MI,Holland,2014.0,9.0,8.0,...,Zuverink,George,195.0,76.0,R,R,1951-04-21,1959-06-15,zuveg101,zuverge01
20088,zwilldu01,1888.0,11.0,2.0,USA,MO,St. Louis,1978.0,3.0,27.0,...,Zwilling,Edward Harrison,160.0,66.0,L,L,1910-08-14,1916-07-12,zwild101,zwilldu01


In [26]:
# Player id plus first and last name, which is all that is needed to label a
# player. `collect()` without the `to_pandas` pipe leaves `people_info` as the
# collected rows; the pipe is applied only for display.
people_info = people.select('playerID', 'nameFirst', 'nameLast').collect()

people_info >> to_pandas

Unnamed: 0,playerID,nameFirst,nameLast
0,aardsda01,David,Aardsma
1,aaronha01,Hank,Aaron
2,aaronto01,Tommie,Aaron
3,aasedo01,Don,Aase
4,abadan01,Andy,Abad
...,...,...,...
20085,zupofr01,Frank,Zupo
20086,zuvelpa01,Paul,Zuvella
20087,zuverge01,George,Zuverink
20088,zwilldu01,Dutch,Zwilling


In [12]:
# Preview the raw teams table (one row per team and season, including the team
# `name`). `collect()` returns every row to the driver before display.
teams.collect() >> to_pandas

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
0,1871,,BS1,BNA,,3,31,,20,10,...,24,0.834,Boston Red Stockings,South End Grounds I,,103,98,BOS,BS1,BS1
1,1871,,CH1,CNA,,2,28,,19,9,...,16,0.829,Chicago White Stockings,Union Base-Ball Grounds,,104,102,CHI,CH1,CH1
2,1871,,CL1,CFC,,8,29,,10,19,...,15,0.818,Cleveland Forest Citys,National Association Grounds,,96,100,CLE,CL1,CL1
3,1871,,FW1,KEK,,7,19,,7,12,...,8,0.803,Fort Wayne Kekiongas,Hamilton Field,,101,107,KEK,FW1,FW1
4,1871,,NY2,NNA,,5,33,,16,17,...,14,0.840,New York Mutuals,Union Grounds (Brooklyn),,90,88,NYU,NY2,NY2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2920,2019,NL,SLN,STL,C,1,162,81.0,91,71,...,168,0.989,St. Louis Cardinals,Busch Stadium III,3480393.0,98,97,STL,SLN,SLN
2921,2019,AL,TBA,TBD,E,2,162,81.0,96,66,...,126,0.985,Tampa Bay Rays,Tropicana Field,1178735.0,97,96,TBR,TBA,TBA
2922,2019,AL,TEX,TEX,W,3,162,81.0,78,84,...,143,0.982,Texas Rangers,Globe Life Park in Arlington,2132994.0,111,112,TEX,TEX,TEX
2923,2019,AL,TOR,TOR,E,4,162,81.0,67,95,...,141,0.984,Toronto Blue Jays,Rogers Centre,1750114.0,97,98,TOR,TOR,TOR


In [23]:
# Team-seasons with more than 100 home runs, reduced to the identifying
# columns, the team name and the HR total. The rows are collected, so
# `team_info` is no longer a Spark DataFrame after this cell.
over_100_hr = col('HR') > 100
team_info = (
    teams
    .select('teamID', 'yearID', 'name', 'HR')
    .filter(over_100_hr)
    .collect()
)

team_info >> to_pandas

Unnamed: 0,teamID,yearID,name,HR
0,CHN,1884,Chicago White Stockings,142
1,BSN,1894,Boston Beaneaters,103
2,NYA,1920,New York Yankees,115
3,NYA,1921,New York Yankees,134
4,PHA,1922,Philadelphia Athletics,111
...,...,...,...,...
1609,SLN,2019,St. Louis Cardinals,210
1610,TBA,2019,Tampa Bay Rays,217
1611,TEX,2019,Texas Rangers,223
1612,TOR,2019,Toronto Blue Jays,247


In [22]:
# Build the join from the Spark DataFrames themselves. `team_info` and
# `batting_players` have already been collected (a list of Rows and a pandas
# table respectively), so calling `.join` on them never reaches PySpark and
# fails; the selections are therefore repeated here on `teams` and `batting`.
team_seasons = (teams
                .select('teamID', 'yearID', 'name', 'HR')
                .where(col('HR') > 100)
               )
player_seasons = batting.select('playerID', 'teamID', 'yearID')

# Match on BOTH keys: joining on `yearID` alone would pair each team with every
# player active that year, not just the players on its roster. Passing column
# names (rather than an `==` expression) also keeps a single copy of each key.
# `join_1` is left uncollected so it can be joined again (e.g. with `people`).
join_1 = team_seasons.join(player_seasons, on=['teamID', 'yearID'], how='left')

# Only a handful of rows are brought back to the driver for display.
join_1.limit(5).collect() >> to_pandas