In [None]:
import time
import pandas as pd
from itertools import islice

In [None]:
from google.cloud import storage

In [None]:
def list_blobs(bucket_name, folder_name):
    """Print every object in a Google Cloud Storage bucket under a name prefix.

    One line is written to stdout per object: the object name, a tab, and the
    size reported by the storage client.

    Args:
        bucket_name: Name of the bucket to inspect.
        folder_name: Prefix passed to ``list_blobs``; only objects whose name
            starts with it are listed.

    Returns:
        None. The listing is printed, not returned.
    """
    client = storage.Client()
    # Materialise the full listing first so nothing is printed if listing fails.
    entries = list(client.bucket(bucket_name).list_blobs(prefix=folder_name))

    for entry in entries:
        print(f"{entry.name}\t{entry.size}")

In [None]:
def list_blobs_pd(bucket_name, folder_name):
    """List objects under a Google Cloud Storage prefix as a styled table.

    Args:
        bucket_name: Name of the bucket to inspect.
        folder_name: Prefix passed to ``list_blobs``; only objects whose name
            starts with it are included.

    Returns:
        A pandas ``Styler`` (not a plain DataFrame) wrapping a frame with the
        columns ``Name`` and ``Size``; ``Size`` is rendered with thousands
        separators and no decimals. The underlying frame is available through
        the Styler's ``data`` attribute.
    """
    client = storage.Client()
    entries = list(client.bucket(bucket_name).list_blobs(prefix=folder_name))

    # One (name, size) record per object, in listing order.
    records = [(entry.name, entry.size) for entry in entries]
    listing = pd.DataFrame(records, columns=['Name', 'Size'])

    # .style yields a Styler; the format only affects how Size is displayed.
    return listing.style.format({"Size": "{:,.0f}"})

In [None]:
def delete_folder(bucket_name, folder_name):
    """Delete every object in a Google Cloud Storage bucket under a name prefix.

    Args:
        bucket_name: Name of the bucket to delete from.
        folder_name: Prefix passed to ``list_blobs``. Selection is by name
            prefix, so every object whose name starts with this string is
            deleted, not only those inside a "folder" of exactly that name.

    Returns:
        None.
    """
    client = storage.Client()
    # Snapshot the listing before deleting so iteration is not affected by removals.
    doomed = list(client.bucket(bucket_name).list_blobs(prefix=folder_name))

    for entry in doomed:
        entry.delete()

In [None]:
from pyspark.sql import SparkSession

# Source bucket: open for reading and available to all students.
bucket_read = 'msca-bdp-tweets'

# Destination bucket for saved results. Students must change this to their own
# bucket (`msca-bdp-students-bucket`) and use their `CNET ID` as a folder prefix.
bucket_write = 'msca-bdp-tweets'

# getOrCreate() reuses an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("Twitter Data Analysis")
    .getOrCreate()
)

In [None]:
# Load the raw data: every *.json object under final_project/ in the read bucket.
# The path is built from `bucket_read` so the configured bucket and the bucket
# actually read cannot drift apart.
df = spark.read.json(f"gs://{bucket_read}/final_project/*.json")

# Descriptive statistics of the loaded frame (triggers a full pass over the data)
df.describe().show()

In [None]:
%%time

# Shell escape: list the contents of the final_project folder in the source
# bucket with the Hadoop filesystem CLI; %%time reports how long the cell took.
!hadoop fs -ls "gs://msca-bdp-tweets/final_project"

In [None]:
from pyspark.sql.functions import col

# Filter out irrelevant tweets to focus on those directly related to primary, secondary, or higher education
relevant_keywords = ["education", "school", "university", "learning", "knowledge"]

# A tweet is kept when its text CONTAINS any keyword, ignoring case.
# (`isin` would only keep tweets whose entire text equals a keyword.)
# rlike uses Java regular expressions; `(?i)` is the inline case-insensitive
# flag. The keywords are joined as regex alternatives, so they must stay plain
# literals (no regex metacharacters). Rows with a null text are dropped.
keyword_pattern = "(?i)" + "|".join(relevant_keywords)
df_filtered = df.filter(col("text").rlike(keyword_pattern))

# Perform exploratory data analysis (EDA) to identify relevant variables for profiling Twitter users
relevant_columns = ["user", "text", "retweet_count", "favorite_count", "created_at"]

# Continue with the rest of the code for visualization...


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Visualize the distribution of tweet counts by user
# NOTE(review): grouping is on the whole `user` column; if that column is a
# nested record whose fields vary between tweets, one account may be split
# across several groups — confirm against the schema.
per_user_counts = df_filtered.groupBy('user').count().select('count')
tweet_counts = [row['count'] for row in per_user_counts.collect()]

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(tweet_counts, bins=20, kde=True, ax=ax)
ax.set_xlabel('Number of Tweets')
ax.set_ylabel('Count')
ax.set_title('Distribution of Tweet Counts by User')
plt.show()

from pyspark.sql.functions import first

# Visualize the geographical distribution of Twitter users
# One location per user group. The aggregate is aliased explicitly: the dict
# form agg({'user_location': 'first'}) names its output `first(user_location)`,
# so the `user_location` lookups below would raise KeyError on the pandas frame.
# `first` takes whichever row Spark encounters first, so the chosen location is
# not deterministic when a user has several.
# NOTE(review): assumes a top-level `user_location` column exists — confirm against the schema.
user_locations = (
    df_filtered
    .groupBy('user')
    .agg(first('user_location').alias('user_location'))
    .toPandas()
)

# value_counts() sorts by frequency, so the first ten index entries are the
# ten most common locations.
top_locations = user_locations['user_location'].value_counts().index[:10]

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y=user_locations['user_location'], order=top_locations, ax=ax)
ax.set_xlabel('Count')
ax.set_ylabel('User Location')
ax.set_title('Top 10 User Locations')
plt.show()
