In [82]:
# --- Standard library ---
from datetime import datetime

# --- Third-party: data handling, plotting, modelling ---
import numpy as np
import pandas as pd
# Fix: `import matplotlib as plt` bound the top-level package, which does not
# provide the plotting functions (`plt.subplots`, `plt.show`, ...). The
# conventional `plt` alias refers to the pyplot module.
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

# --- PySpark ---
from pyspark.sql import Row, SparkSession
# The wildcard import is kept because cells in this notebook use the type names
# unqualified (StructType, StructField, StringType).
from pyspark.sql.types import *
from pyspark.sql.types import DateType
# `col` was previously imported twice; one consolidated import covers every name.
from pyspark.sql.functions import (
    col,
    from_unixtime,
    to_date,
    to_timestamp,
    unix_timestamp,
)

# Default seaborn colour palette, kept in a module-level name for plotting cells.
color_pal = sns.color_palette()

In [83]:
# `spark` and `sc` are used by this notebook but are not created in any cell,
# so a fresh kernel only works if the environment injects them.
# getOrCreate() returns the already-running session when there is one, and
# otherwise starts a new one, so the notebook runs either way.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
sc

In [124]:
# The file is read with header=False, so column names are assigned by position.
# Every column is declared as a nullable string.
column_names = ["index", "id", "date", "flag", "user", "text"]
custom_schema = StructType(
    [StructField(name, StringType(), True) for name in column_names]
)

df = spark.read.csv(
    "hdfs://localhost:9000/sharesprk1/ProjectTweets.csv",
    header=False,
    schema=custom_schema,
)
df.show()

+-----+----------+--------------------+--------+---------------+--------------------+
|index|        id|                date|    flag|           user|                text|
+-----+----------+--------------------+--------+---------------+--------------------+
|    0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|    1|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|    2|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|    3|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|    4|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|    5|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|    6|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|    7|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|    8|1467811795|Mon Apr 06 22:20:...|NO_QUERY|2Hood4

In [125]:
# Parse the raw `date` string into a timestamp column named `datetime`, then
# remove the columns listed in `columns_to_drop` from the frame.
timestamp_pattern = "EEE MMM dd HH:mm:ss zzz yyyy"
columns_to_drop = ['date', 'flag', 'id']
df = (
    df
    .withColumn("datetime", to_timestamp(col("date"), timestamp_pattern))
    .drop(*columns_to_drop)
)
df.show(truncate=True)


+-----+---------------+--------------------+-------------------+
|index|           user|                text|           datetime|
+-----+---------------+--------------------+-------------------+
|    0|_TheSpecialOne_|@switchfoot http:...|2009-04-07 05:19:45|
|    1|  scotthamilton|is upset that he ...|2009-04-07 05:19:49|
|    2|       mattycus|@Kenichan I dived...|2009-04-07 05:19:53|
|    3|        ElleCTF|my whole body fee...|2009-04-07 05:19:57|
|    4|         Karoli|@nationwideclass ...|2009-04-07 05:19:57|
|    5|       joy_wolf|@Kwesidei not the...|2009-04-07 05:20:00|
|    6|        mybirch|         Need a hug |2009-04-07 05:20:03|
|    7|           coZZ|@LOLTrish hey  lo...|2009-04-07 05:20:03|
|    8|2Hood4Hollywood|@Tatiana_K nope t...|2009-04-07 05:20:05|
|    9|        mimismo|@twittera que me ...|2009-04-07 05:20:09|
|   10| erinx3leannexo|spring break in p...|2009-04-07 05:20:16|
|   11|   pardonlauren|I just re-pierced...|2009-04-07 05:20:17|
|   12|           TLeC|@c

In [126]:
# Print the column names and types of `df`, e.g. to confirm that `datetime`
# was parsed as a timestamp rather than left as a string.
df.printSchema()

root
 |-- index: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)
 |-- datetime: timestamp (nullable = true)



In [127]:
# Spark DataFrames have no index to reset, so the chronological ordering is
# obtained by sorting on the parsed timestamp column (ascending).
df = df.sort("datetime")

In [128]:
# Preview the first rows of `df` after it has been ordered by `datetime`.
df.show()

+-----+---------------+--------------------+-------------------+
|index|           user|                text|           datetime|
+-----+---------------+--------------------+-------------------+
|    0|_TheSpecialOne_|@switchfoot http:...|2009-04-07 05:19:45|
|    1|  scotthamilton|is upset that he ...|2009-04-07 05:19:49|
|    2|       mattycus|@Kenichan I dived...|2009-04-07 05:19:53|
|    3|        ElleCTF|my whole body fee...|2009-04-07 05:19:57|
|    4|         Karoli|@nationwideclass ...|2009-04-07 05:19:57|
|    5|       joy_wolf|@Kwesidei not the...|2009-04-07 05:20:00|
|    7|           coZZ|@LOLTrish hey  lo...|2009-04-07 05:20:03|
|    6|        mybirch|         Need a hug |2009-04-07 05:20:03|
|    8|2Hood4Hollywood|@Tatiana_K nope t...|2009-04-07 05:20:05|
|    9|        mimismo|@twittera que me ...|2009-04-07 05:20:09|
|   10| erinx3leannexo|spring break in p...|2009-04-07 05:20:16|
|   11|   pardonlauren|I just re-pierced...|2009-04-07 05:20:17|
|   13|robrobbierobert|@o

In [129]:
# Report how many rows of `df` have a missing value in any of its columns.
# The earlier version of this check filtered on `Date` and `ID`, but both were
# dropped from `df` when the `datetime` column was created, so it did not
# inspect the columns this frame actually has (including the parsed timestamp).
has_missing = None
for column_name in df.columns:
    column_is_null = col(column_name).isNull()
    has_missing = column_is_null if has_missing is None else (has_missing | column_is_null)

total_rows = df.count()
# A frame with no columns cannot have missing values; skip the filter in that case.
num_rows_with_missing = df.filter(has_missing).count() if has_missing is not None else 0

# Guard the ratio so an empty frame reports 0% instead of raising ZeroDivisionError.
missing_share = num_rows_with_missing / total_rows if total_rows else 0.0

# Show the absolute count (matching the label) together with the percentage.
print(f'Number of rows with missing values: {num_rows_with_missing} ({missing_share:.2%})')

Number of rows with missing values: 0.00%
