# Twitter Project - Data Cleaning for the Tweet

### Import libraries

In [1]:
%reload_ext lab_black

import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

# nltk.download("stopwords")
# nltk.download("punkt")

### Cleaning for iPhone14

In [2]:
# reading iPhone14 data from csv
df_raw = pd.read_csv("../data/raw/iPhone14x10k_RAW.csv", index_col=0)

In [3]:
# drop duplicates
df_a = df_raw.drop_duplicates(keep="first")
df_b = df_a.copy()

# convert Tweet to lowercase for string comparison
df_b["Tweet"] = df_a["Tweet"].str.lower()

# create a new column for retweet
df_b["retweet"] = df_b["Tweet"].str.contains("rt @")

# create a new column for the keyword #iphone14
df_b["iphone"] = df_b["Tweet"].str.contains("#iphone14")

# drop individuals if Tweet does not contain the hashtag #iPhone14
df_c = df_b[df_b["iphone"] == True]

df_c.shape

(8649, 5)

### Cleaning for iPhone14Pro

In [4]:
# reading iPhone14 data from csv
dfpro_raw = pd.read_csv("../data/raw/iPhone14Prox10k_RAW.csv", index_col=0)

In [5]:
# drop duplicates
dfpro_a = dfpro_raw.drop_duplicates(keep="first")
dfpro_b = dfpro_a.copy()

# convert Tweet to lowercase for string comparison
dfpro_b["Tweet"] = dfpro_a["Tweet"].str.lower()

# create a new column for retweet
dfpro_b["retweet"] = dfpro_b["Tweet"].str.contains("rt @")

# create a new column for the keyword #iphone14
dfpro_b["iphonepro"] = dfpro_b["Tweet"].str.contains("#iphone14pro")

# drop individuals if Tweet does not contain the hashtag #iPhone14Pro
dfpro_c = dfpro_b[dfpro_b["iphonepro"] == True]

dfpro_c.shape

(8103, 5)

### Concatenate iPhone14 and iPhone14Pro DataFrames

In [6]:
df_comb = pd.concat([df_c, dfpro_c]).reset_index(drop=True)
df_comb.shape

(16752, 6)

In [7]:
# replace the missing iphone and iphonepro with False as they are from the other dataset
df = df_comb.copy()
df[["iphone", "iphonepro"]] = df_comb[["iphone", "iphonepro"]].fillna(False)

In [8]:
df.head()

Unnamed: 0,User,DateTime,Tweet,retweet,iphone,iphonepro
0,bosesubash56,2022-09-14 14:17:04+00:00,@ws_mobiletrans #nothingchangedaftertransfer a...,False,True,False
1,you_swerve,2022-09-14 14:15:18+00:00,rt @insanetweet: iphone 8 while upgrading to i...,True,True,False
2,MarkVenaTechGuy,2022-09-14 14:13:51+00:00,my #smarttechcheck newsletter thoughts about t...,False,True,False
3,WinPhanKyle,2022-09-14 14:13:16+00:00,rt @saradietschy: #iphone14 unboxing time!! go...,True,True,False
4,zaheenhafzer,2022-09-14 14:13:04+00:00,rt @zollotech: but first the cases…#iphone14\r...,True,True,False


In [9]:
## create a new column for individuals that are in both iPhone14 and iPhone14Pro DataFrames
# df["overlap"] = df.duplicated(subset=["User", "DateTime", "Tweet"], keep=False)

# drop individuals that are in both iPhone14 and iPhone14Pro DataFrames
df_sub = df.drop_duplicates(subset=["User", "DateTime", "Tweet"], keep=False).copy()

In [10]:
# define a function to clean up the text in Tweet before counting them


def cleanup(text):
    # remove elements that begin with @... or http...
    text1 = re.sub(r"@\S+", "", text)
    text2 = re.sub(r"http\S+", "", text1)

    # remove \r, \n, RT, or &amp;
    text3 = (
        text2.replace("\r", "")
        .replace("\n", " ")
        .replace("RT", "")
        .replace("&amp;", "")
    )

    # remove punctuation
    text4 = "".join([i.lower() for i in text3 if i not in string.punctuation])

    # remove stopwords
    text5 = " ".join(
        [
            word
            for word in word_tokenize(text4)
            if not word in stopwords.words("english")
        ]
    )

    return text5

In [11]:
# define the function to return the list with "n" most common words in Tweet


def tweet_words(n, df_in):

    # combine the Tweet of all individuals
    combitext = " ".join(df_in["Tweet"].values)

    # clean up the combined text
    cleantext = cleanup(combitext)

    # show the list with "n" most common words
    return Counter(cleantext.split()).most_common(n)

In [12]:
#########################################################################################
# The input DataFrame for generating the text count. Just need to change
# the variable name (e.g., iphonepro) and/or the value (e.g., True) of the condition
#########################################################################################

df_input = df_sub[df_sub["iphonepro"] == True]
tweet_words(20, df_input)

[('iphone14pro', 3811),
 ('rt', 2401),
 ('iphone', 1641),
 ('pro', 1476),
 ('new', 1315),
 ('iphone14', 1207),
 ('better', 951),
 ('experience', 931),
 ('travel', 925),
 ('easier', 924),
 ('🏆win', 923),
 ('tourism', 923),
 ('appleevent', 889),
 ('14', 850),
 ('apple', 747),
 ('’', 507),
 ('iphone14promax', 401),
 ('help', 285),
 ('today', 252),
 ('x', 225)]

In [13]:
df_input = df_sub[df_sub["iphone"] == True]
tweet_words(20, df_input)

[('iphone14', 4486),
 ('iphone', 2746),
 ('rt', 2466),
 ('apple', 1453),
 ('ios16', 1127),
 ('14', 1046),
 ('users', 780),
 ('rn', 743),
 ('new', 571),
 ('’', 479),
 ('appleevent', 428),
 ('pro', 301),
 ('ios', 285),
 ('let', 232),
 ('know', 228),
 ('ad', 224),
 ('shot', 218),
 ('want', 207),
 ('features', 203),
 ('first', 187)]

In [14]:
df_input = df_sub[(df_sub["retweet"] == False) & (df_sub["iphonepro"] == True)]
tweet_words(20, df_input)

[('iphone14pro', 1499),
 ('iphone', 804),
 ('apple', 514),
 ('14', 395),
 ('pro', 310),
 ('help', 283),
 ('iphone14promax', 273),
 ('’', 228),
 ('new', 204),
 ('appleevent', 176),
 ('iphone14', 175),
 ('ios16', 172),
 ('someone', 142),
 ('dynamic', 137),
 ('essay', 136),
 ('island', 131),
 ('us', 109),
 ('available', 109),
 ('need', 102),
 ('dynamicisland', 97)]

In [15]:
df_input = df_sub[df_sub["User"] == "Rokibul60100130"]
tweet_words(20, df_input)

[('hay', 160),
 ('want', 160),
 ('create', 160),
 ('cute', 160),
 ('image', 160),
 ('cartoon', 160),
 ('portrait', 160),
 ('visit', 160),
 ('jennie', 160),
 ('lisa', 160),
 ('pakistan', 160),
 ('iphone14', 160),
 ('brahmastra', 160),
 ('xbox', 160),
 ('worldcup2022', 160),
 ('meditation', 160),
 ('chengyi', 160),
 ('rahulvaidya', 160),
 ('jesus', 160),
 ('australia', 160)]