In [1]:
#!hdfs dfs -ls -R /twitter/movie/DeerSet9/ | grep -v "^d" | awk '{if ($5 == 0) print $8}'

In [2]:
#!hdfs dfs -rm $(hdfs dfs -ls -R /twitter/movie/DeerSet9/ | grep -v "^d" | awk '{if ($5 == 0) print $8}')

In [3]:
#!hdfs dfs -ls -R /twitter/movie/DeerSet9/ | grep -v "^d" | awk '{if ($5 == 0) print $8}'

In [1]:
dataPath = '/twitter/movie/DeerSet9/'

In [8]:
def loadTwitterData(filePath):
    
    df = spark.read.json(filePath + '*.gz')
    df2 = df.select('body','gnip.matching_rules.tag', \
                    'gnip.matching_rules.value', \
                    'postedTime', 'retweetCount').filter(df.twitter_lang == "en").na.drop()
    df2 = df2.withColumn('date', df2['postedTime'].cast('date'))
    df2 = df2.withColumnRenamed("tag", "movieName")
    df2 = df2.withColumnRenamed("value", "searchPattern")
    return df2

In [9]:
df = loadTwitterData(dataPath)

In [10]:
df.count()

169620

In [11]:
df.columns

['body', 'movieName', 'searchPattern', 'postedTime', 'retweetCount', 'date']

In [10]:
df2.select('tag').distinct().show()

+------------------+
|               tag|
+------------------+
|[captive, sicario]|
|         [captive]|
|         [sicario]|
|[sicario, captive]|
+------------------+



In [32]:
df2.printSchema()

root
 |-- body: string (nullable = true)
 |-- tag: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- value: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- twitter_lang: string (nullable = true)



In [37]:
df3 = df2.withColumn('tag', df2['tag'].cast('string')) \
         .withColumn('value', df2['value'].cast('string')) 

# df3 = df2.withColumn('tag', df2['tag'].getItem(0).cast('string')) \
#          .withColumn('value', df2['value'].getItem(0).cast('string')) 

In [30]:
df3.select('tag').distinct().show()

+-----------------+
|              tag|
+-----------------+
|[sicario,captive]|
|        [captive]|
|        [sicario]|
|[captive,sicario]|
+-----------------+



### Finding Unique Movies in a data

In [72]:
from pyspark.sql.functions import col

In [49]:
movies = df3.select('tag').where(~col('tag').like('%,%')).distinct().collect()

In [87]:
movies2 = [str(i.tag[1:-1]) for i in movies]
print(movies2)


['captive', 'sicario']


In [88]:
df3.rdd.getNumPartitions()

941

In [35]:
df3.select('body').filter(df3.tag.like('%captive,sicario%')).show(10, False)

+--------------------------------------------------------------------------------------------------------------------------------+
|body                                                                                                                            |
+--------------------------------------------------------------------------------------------------------------------------------+
|#Trailers @Captive | i-Lived | The Seventh Dwarf | @SicarioMovie and Daddy's Home http://t.co/mHOqxgeVFt https://t.co/v2Ho4UAPLY|
+--------------------------------------------------------------------------------------------------------------------------------+



In [39]:
df3.select('body').filter(df3.tag.like('%sicario%')).count()

150731

In [40]:
df3.select('body').filter(df3.tag.like('%captive%')).count()

18892