In [1]:
import quandl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm # Using .api imports the public access version of statsmodels, which is a library that handles 
# statistical models.
import os
import warnings # This is a library that handles warnings.

# Silence only the deprecation-style warning categories. These announce suspended or
# changing library features and mostly matter to developers. Other categories are left
# visible on purpose: a blanket warnings.filterwarnings("ignore") would also hide
# warnings an analyst needs to see (e.g. ones raised while parsing or converting data).
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

plt.style.use('fivethirtyeight') # Styling option that controls how the plots will appear. More examples here:
# https://matplotlib.org/3.2.1/tutorials/introductory/customizing.html
# https://matplotlib.org/3.1.0/gallery/style_sheets/fivethirtyeight.html

In [2]:
# Define path
#
# The project folder can be supplied through the SOCIAL_BUZZ_PATH environment variable
# so the notebook can run on another machine. When the variable is not set, the
# original local folder is used, so existing behaviour is unchanged.

path = os.environ.get(
    'SOCIAL_BUZZ_PATH',
    '/Users/macbook/Library/CloudStorage/OneDrive-Personal/Data Analisys/Social Buzz',
)

In [3]:
# Load the prepared data set from the project folder.
# NOTE(review): the file appears to be semicolon-delimited (it loads as a single column).
# Passing sep=';' to read_csv would parse it directly, but the later cells that split
# that single column by hand would then have to be removed as well.
csv_file = os.path.join(path, '02 Data', 'Prepared Data', 'ModeledDataSet3.1.csv')
df = pd.read_csv(csv_file)

In [4]:
# Preview the first rows of the frame exactly as it was loaded.
df.head()

Unnamed: 0,F1;Content ID;User ID;Reaction Type;Datetime;Content Type;Category;Sentiment;Score
0,1;97522e57-d9ab-4bd6-97bf-c24d952602d2;5d45458...
1,2;97522e57-d9ab-4bd6-97bf-c24d952602d2;92b87fa...
2,3;97522e57-d9ab-4bd6-97bf-c24d952602d2;163daa3...
3,4;97522e57-d9ab-4bd6-97bf-c24d952602d2;34e8add...
4,5;97522e57-d9ab-4bd6-97bf-c24d952602d2;9b6d35f...


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(24573, 1)

In [6]:
# List the column labels of the loaded frame.
df.columns

Index(['F1;Content ID;User ID;Reaction Type;Datetime;Content Type;Category;Sentiment;Score'], dtype='object')

In [7]:
# Data type of each column in the loaded frame.
df.dtypes

F1;Content ID;User ID;Reaction Type;Datetime;Content Type;Category;Sentiment;Score    object
dtype: object

In [8]:
# Summary of the loaded frame: index range, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24573 entries, 0 to 24572
Data columns (total 1 columns):
 #   Column                                                                              Non-Null Count  Dtype 
---  ------                                                                              --------------  ----- 
 0   F1;Content ID;User ID;Reaction Type;Datetime;Content Type;Category;Sentiment;Score  24573 non-null  object
dtypes: object(1)
memory usage: 192.1+ KB


In [9]:
# The data arrived as one semicolon-packed column; break every row apart on ';'
# so that each field ends up in its own column.
packed_column = 'F1;Content ID;User ID;Reaction Type;Datetime;Content Type;Category;Sentiment;Score'
new_columns = df[packed_column].str.split(pat=';', expand=True)

In [10]:
# Give the split-out fields their proper names (same order as in the packed header)
field_names = [
    'F1',
    'Content ID',
    'User ID',
    'Reaction Type',
    'Datetime',
    'Content Type',
    'Category',
    'Sentiment',
    'Score',
]
new_columns.columns = field_names

In [11]:
# Attach the split-out fields column-wise next to the existing (still packed) column
frames_side_by_side = [df, new_columns]
df = pd.concat(frames_side_by_side, axis='columns')

In [12]:
# Remove the packed semicolon-separated column now that its fields have their own columns
df = df.drop(columns=['F1;Content ID;User ID;Reaction Type;Datetime;Content Type;Category;Sentiment;Score'])


In [13]:
# Summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24573 entries, 0 to 24572
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   F1             24573 non-null  object
 1   Content ID     24573 non-null  object
 2   User ID        24573 non-null  object
 3   Reaction Type  24573 non-null  object
 4   Datetime       24573 non-null  object
 5   Content Type   24573 non-null  object
 6   Category       24573 non-null  object
 7   Sentiment      24573 non-null  object
 8   Score          24573 non-null  object
dtypes: object(9)
memory usage: 1.7+ MB


In [14]:
# Use the "F1" column as the row index (this removes it from the regular columns)
df = df.set_index(keys="F1")

In [15]:
# Data type of each remaining column.
df.dtypes

Content ID       object
User ID          object
Reaction Type    object
Datetime         object
Content Type     object
Category         object
Sentiment        object
Score            object
dtype: object

In [16]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0_level_0,Content ID,User ID,Reaction Type,Datetime,Content Type,Category,Sentiment,Score
F1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,7.11.20 09:43,photo,studying,negative,0
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,17.06.21 12:22,photo,studying,negative,10
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,18.04.21 05:13,photo,studying,negative,15
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,6.01.21 19:13,photo,studying,negative,0
5,97522e57-d9ab-4bd6-97bf-c24d952602d2,9b6d35f9-5e15-4cd0-a8d7-b1f3340e02c4,interested,23.08.20 12:25,photo,studying,positive,30


In [17]:
# Check for missing values
# Counts NaN/None entries per column; empty strings are not counted as missing.

df.isna().sum() # No missing values!

Content ID       0
User ID          0
Reaction Type    0
Datetime         0
Content Type     0
Category         0
Sentiment        0
Score            0
dtype: int64

In [18]:
# Find duplicates
# A row is flagged when all of its column values repeat an earlier row
# (the index is not part of the comparison).

is_repeat = df.duplicated()
df_dups = df.loc[is_repeat]

In [19]:
# Zero rows here means no fully duplicated rows were found.
df_dups.shape # No duplicates!

(0, 8)

In [20]:
# Mixed-type check: print the name of every column whose values are not all of
# the same Python type. Each value is mapped to its type and the distinct types
# are counted. Series.map is used rather than DataFrame.applymap, which is
# deprecated since pandas 2.1; an empty frame simply prints nothing.
for col in df.columns:
    value_types = df[col].map(type)
    if value_types.nunique() > 1:
        print(col)

In [24]:
# Turn the listed text columns into numbers.
# errors='coerce' means any value that cannot be parsed becomes NaN instead of raising.
columns_to_convert = ['Score']
for col in columns_to_convert:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [25]:
# Confirm the dtype of each column after the numeric conversion.
df.dtypes

Content ID       object
User ID          object
Reaction Type    object
Datetime         object
Content Type     object
Category         object
Sentiment        object
Score             int64
dtype: object

In [27]:
# Convert the Datetime column to timestamps
# The raw strings are day-first with a two-digit year, e.g. "17.06.21 12:22" is
# 17 June 2021. The format is therefore given explicitly: letting pandas guess would
# read a value such as "7.11.20" month-first (11 July instead of 7 November) or fail,
# depending on the pandas version. A value that does not match the format raises.
df['Datetime'] = pd.to_datetime(df['Datetime'], format='%d.%m.%y %H:%M')


In [28]:
# Replace the timestamps with their numeric value as float64.
# NOTE(review): this overwrites the parsed timestamps; the numbers are presumably
# nanoseconds since the Unix epoch (pandas' default datetime unit) — confirm.
timestamp_values = df['Datetime'].values
df['Datetime'] = timestamp_values.astype(float)

In [29]:
# Verify the data type of the Datetime column (printed together with all other column dtypes)
print(df.dtypes)

Content ID        object
User ID           object
Reaction Type     object
Datetime         float64
Content Type      object
Category          object
Sentiment         object
Score              int64
dtype: object
