In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('Repub-Democ-Tweet').getOrCreate()
cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print(f'nb of cores used : {cores}')
spark

nb of cores used : 1


In [3]:
tweets = (spark.read
         .format('csv')
         .option('header', 'true')
         .option('inferSchema', 'true')
         .load('./data/Rep_vs_Dem_tweets.csv'))

In [4]:
tweets.limit(5).toPandas()

Unnamed: 0,Party,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,"Congress has allocated about $18…""",,
4,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...


In [5]:
tweets.select('Tweet').show(5, False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|Tweet                                                                                                                                       |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L |
|RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…|
|RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages.                                    |
|null                                                                                                                                        |

In [6]:
tweets.printSchema()

root
 |-- Party: string (nullable = true)
 |-- Handle: string (nullable = true)
 |-- Tweet: string (nullable = true)



In [7]:
from pyspark.sql.functions import * # regexp_extract, regexp_replace

In [8]:
latino = tweets.withColumn('Latino_Mentions', 
                           regexp_extract(tweets['Tweet'], 
                           "(.)(@LatinoLeader)(.)", 2))
latino.limit(7).toPandas()

Unnamed: 0,Party,Handle,Tweet,Latino_Mentions
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P...",
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...,
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,
3,"Congress has allocated about $18…""",,,
4,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,@LatinoLeader
5,Democrat,RepDarrenSoto,RT @Vegalteno: Hurricane season starts on June...,
6,Democrat,RepDarrenSoto,RT @EmgageActionFL: Thank you to all who came ...,


In [9]:
counts = tweets.groupBy('Party').count()
counts.orderBy(desc('count')).show(20, False)

+-----------------------------------------+-----+
|Party                                    |count|
+-----------------------------------------+-----+
|Republican                               |44392|
|Democrat                                 |42068|
|That’s…"                                 |28   |
|https://t.co/oc6JNAF5K5                  |22   |
|Now                                      |17   |
|Today                                    |13   |
|https://t…"                              |12   |
|http…"                                   |12   |
|h…"                                      |12   |
|❌ E…"                                    |11   |
|❌ Passed #TaxScam                        |11   |
|#mepolitics"                             |11   |
|❌ Terminated #DACA                       |11   |
|❌ Abandoned Hispanic outreach for #ACA   |11   |
|https://t.co/8htzynw0mp"                 |9    |
|https://t.co…"                           |9    |
|💻 Website: https://t.co/dqyGOuzN…"      |9    |
|

In [10]:
from pyspark.sql.functions import when

In [11]:
democ = (tweets.withColumn('Party', 
                            when(tweets['Party'] == 'Democrat', 'Democrat')
                            .when(tweets.Party == 'Republican', 'Republican')
                            .otherwise('Other')))
democ = democ.groupBy('Party').count()
democ.orderBy(desc('count')).show(5)

+----------+-----+
|     Party|count|
+----------+-----+
|Republican|44392|
|  Democrat|42068|
|     Other| 6029|
+----------+-----+



In [13]:
print('DG Tweet')
tweets.select('tweet').show(2, False)

DG Tweet
+--------------------------------------------------------------------------------------------------------------------------------------------+
|tweet                                                                                                                                       |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L |
|RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…|
+--------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 2 rows



In [31]:
print('Cleaned Tweet')
(tweets.withColumn('Cleaned', 
        regexp_replace('Tweet', 
                       "(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+)) \
                       ([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", ""))
 .select('Cleaned')
 .show(1, False))

Cleaned Tweet
+-------------------------------------------------------------------------------------------------------------------------------------------+
|Cleaned                                                                                                                                    |
+-------------------------------------------------------------------------------------------------------------------------------------------+
|Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L|
+-------------------------------------------------------------------------------------------------------------------------------------------+
only showing top 1 row



In [23]:
tweets.select('Tweet').show(5, False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|Tweet                                                                                                                                       |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L |
|RT @WinterHavenSun: Winter Haven resident / Alta Vista teacher is one of several recognized by @RepDarrenSoto for National Teacher Apprecia…|
|RT @NBCLatino: .@RepDarrenSoto noted that Hurricane Maria has left approximately $90 billion in damages.                                    |
|null                                                                                                                                        |

In [26]:
tweets.select('Tweet', trim(tweets['Tweet'])).limit(7).toPandas()

Unnamed: 0,Tweet,trim(Tweet)
0,"Today, Senate Dems vote to #SaveTheInternet. P...","Today, Senate Dems vote to #SaveTheInternet. P..."
1,RT @WinterHavenSun: Winter Haven resident / Al...,RT @WinterHavenSun: Winter Haven resident / Al...
2,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,,
4,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
5,RT @Vegalteno: Hurricane season starts on June...,RT @Vegalteno: Hurricane season starts on June...
6,RT @EmgageActionFL: Thank you to all who came ...,RT @EmgageActionFL: Thank you to all who came ...


In [32]:
renamed = tweets.withColumnRenamed('Party', 'Dem_Rep')
renamed.limit(5).toPandas()

Unnamed: 0,Dem_Rep,Handle,Tweet
0,Democrat,RepDarrenSoto,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RepDarrenSoto,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RepDarrenSoto,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,"Congress has allocated about $18…""",,
4,Democrat,RepDarrenSoto,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...


In [35]:
tweets.select(tweets['Party'], tweets.Handle, concat_ws(' ', tweets.Party, tweets['Handle']).alias('Concatened')).show(5, False)

+----------------------------------+-------------+----------------------------------+
|Party                             |Handle       |Concatened                        |
+----------------------------------+-------------+----------------------------------+
|Democrat                          |RepDarrenSoto|Democrat RepDarrenSoto            |
|Democrat                          |RepDarrenSoto|Democrat RepDarrenSoto            |
|Democrat                          |RepDarrenSoto|Democrat RepDarrenSoto            |
|Congress has allocated about $18…"|null         |Congress has allocated about $18…"|
|Democrat                          |RepDarrenSoto|Democrat RepDarrenSoto            |
+----------------------------------+-------------+----------------------------------+
only showing top 5 rows

