In [24]:

# IMPORT REQUIRED PACKAGES

import pandas as pd
import numpy as np
from IPython.display import display, HTML

#------------------INPUT-------------------#

# INSERT FILENAME
file_name = 'Tweets_Chatgpt_2023.csv'

#------------------------------------------#

def count_special_characters(df):
    special_characters = r'!?-\|:;#@()+-="{}[]*$%^&€£/~`'
    special_char_count = {}
    columns = df.columns

    for column in columns:
        special_char_count[column] = 0
        for value in df[column]:
            if any(char in special_characters for char in str(value)):
                special_char_count[column] += 1

    return [count for _, count in special_char_count.items()]


def count_rows_with_whitespace(df):
    counts = {}
    for column in df.columns:
        count = df[column].apply(lambda value: isinstance(value, str) and r'\s{2,}' in value).sum()
        counts[column] = count
    return counts.values()


def count_rows_with_breaks(df):
    counts = {}
    for column in df.columns:
        count = df[column].apply(lambda value: isinstance(value, str) and '\n' in value).sum()
        counts[column] = count
    return counts.values()


def format_numbers(value):
    if isinstance(value, (float, int)):
        return "{:,}".format(value)
    else:
        return value


# DATASET ANALYZER FUNCTION

def analyze_dataset(file_name):
    # Determine file format based on file extension
    file_extension = file_name.split('.')[-1].lower()

    # Read the data file
    if file_extension == 'csv':
        df = pd.read_csv(file_name)
    elif file_extension == 'xlsx':
        df = pd.read_excel(file_name)
    elif file_extension == 'json':
        df = pd.read_json(file_name)
    else:
        raise ValueError(f"Unsupported file format: {file_extension}. Only CSV, XLSX, and JSON files are supported.")

    # Get dataset information
    num_columns = len(df.columns)
    num_rows = len(df)
    duplicate_rows = (df.duplicated(keep=False)).sum()
    memory_usage = df.memory_usage().sum()

    # Create dataset table
    dataset_info = pd.DataFrame({
        'Total Columns': [num_columns],
        'Total Rows': [num_rows],
        'Duplicate Rows': [duplicate_rows],
        'Memory Usage': [memory_usage]
    })

    # Get column information
    columns = df.columns.tolist() # Column Names
    data_types = df.dtypes.tolist() # Datatypes
    null_counts = df.isnull().sum().tolist() # NULL count
    na_counts = df.isna().sum().tolist() # NA count

    numeric_columns = df.select_dtypes(include=np.number).columns
    mean_values = df[numeric_columns].mean().round(1).tolist() # Average
    max_values = df[numeric_columns].max().round(1).tolist() # Maximum
    min_values = df[numeric_columns].min().round(1).tolist() # Minimum

    unique_counts = df.nunique().tolist() # Unique values count

    # Create DataFrame with column information
    column_info = pd.DataFrame({
        'Column Name': columns,
        'Data Type': data_types,
        'Null Count': null_counts,
        'NA Count': na_counts,
        'Unique Count': unique_counts,
        'Mean': np.nan,
        'Max': np.nan,
        'Min': np.nan
    })

    # Append mean, max, and min for Numeric DataFrame columns
    column_info.loc[column_info['Column Name'].isin(numeric_columns), 'Mean'] = mean_values
    column_info.loc[column_info['Column Name'].isin(numeric_columns), 'Max'] = max_values
    column_info.loc[column_info['Column Name'].isin(numeric_columns), 'Min'] = min_values

    # Count rows with whitespace
    whitespace_count = count_rows_with_whitespace(df)

    # Count rows with special characters
    special_characters_count = count_special_characters(df)

    # Count rows with break lines
    break_line_count = count_rows_with_breaks(df)

    # Append additional columns to the DataFrame
    column_info['Special Characters Count'] = special_characters_count
    column_info['Whitespace Count'] = whitespace_count
    column_info['Break Line Count'] = break_line_count

    # Dataset views
    head_table = df.head()
    tail_table = df.tail()
    sample_table = df.sample(10)

    # Display Report of All Outputs
    display(HTML(f'<div style="text-align: center;"><h1 style="font-size: 22px; display: inline;">\nDATASET EXPLORED: </h1> <span style="font-size: 16px;">{file_name}</span></div>\n'))
    
    display(HTML('<h2 styple="font-size: 14px;">\n\nDATASET OVERVIEW:</h2>'))
    formatted_info = dataset_info.applymap(format_numbers)
    display(HTML(formatted_info.to_html(index=False)))

    display(HTML('<h2 styple="font-size: 14px;">\nDATASET SUMMARY:</h2>'))
    formatted_info = column_info.applymap(format_numbers)
    display(HTML(formatted_info.to_html(index=True)))
    
    display(HTML('<h2 styple="font-size: 14px;">\nDATASET HEAD:</h2>'))
    display(HTML(head_table.to_html(index=False)))
    display(HTML('<h2 styple="font-size: 14px;">\n\nDATASET TAIL:</h2>'))
    display(HTML(tail_table.to_html(index=False)))
    display(HTML('<h2 styple="font-size: 14px;">\n\nDATASET SAMPLE:</h2>'))
    display(HTML(sample_table.to_html(index=False)))


# EXECUTE DATASET ANALYZER FUNCTION

analyze_dataset(file_name)




Total Columns,Total Rows,Duplicate Rows,Memory Usage
6,500036,0,24001856


Unnamed: 0,Column Name,Data Type,Null Count,NA Count,Unique Count,Mean,Max,Min,Special Characters Count,Whitespace Count,Break Line Count
0,date,object,0,0,475394,,,,500036,0,0
1,id,object,6,6,500007,,,,0,0,0
2,content,object,6,6,493744,,,,479965,0,240030
3,username,object,34,34,250006,,,,0,0,0
4,like_count,float64,62,62,1066,7.1,64094.0,0.0,0,0,0
5,retweet_count,float64,62,62,489,1.5,16080.0,0.0,0,0,0


date,id,content,username,like_count,retweet_count
2023-03-29 22:58:21+00:00,1641213230730051584,"Free AI marketing and automation tools, strategies, and collaboration launching new week https://t.co/Qwti8LfBpb #ChatGPT",RealProfitPros,0.0,0.0
2023-03-29 22:58:18+00:00,1641213218520481805,@MecoleHardman4 Chat GPT says it’s 15. 😂,AmyLouWho321,0.0,0.0
2023-03-29 22:57:53+00:00,1641213115684536323,"https://t.co/FjJSprt0te - Chat with any PDF!\nCheck out how this new AI quickly answers questions from your PDFs.\nPerfect for students, researchers, and other curious minds. \n#research #chatpdf #ChatGPT",yjleon1976,0.0,0.0
2023-03-29 22:57:52+00:00,1641213110915571715,"AI muses: ""In the court of life, we must all face the judge of destiny and the jury of our actions. ⚖️🔮 #OutOfContextAI #AILifeLessons #ChatGPT",ChatGPT_Thinks,0.0,0.0
2023-03-29 22:57:26+00:00,1641213003260633088,"Most people haven't heard of Chat GPT yet.\nFirst, elite factions will decide which way to go on AI safety. Next they will push their agenda(s) on the public with misleading and oversimplified media presentations. Finally, the brainless Red and Blue camps will screech their lines",nikocosmonaut,0.0,0.0


date,id,content,username,like_count,retweet_count
2023-01-04 07:18:08+00:00,1610536038094757888,@GoogleAI #LAMDA Versus @OpenAI #ChatGPT ?! Who cares? Lamda isn't available. ChatGPT is. What's my reason to be interested in comparing the two? Lamda is fiction! There's no good evidence it exists! Lamda is as fake as Commander Data! https://t.co/g0SlTAJIs1 via @Marktechpost,Pup_In_Cup,1.0,0.0
2023-01-04 07:17:50+00:00,1610535961670172674,#ChatGPT \n\nSo much #Censorship.\n\nNever trust a system you don't admin. https://t.co/nlcr0FUeDx,TryingToOffend,2.0,0.0
2023-01-04 07:17:20+00:00,1610535837363486720,all my twitter feed is about ChatGPT and @OpenAI lol 😆\n\n#AI #ChatGPT,mcp350,3.0,1.0
2023-01-04 07:17:08+00:00,1610535786017091584,I'm quite amazed by Chat GPT. A really promising person 😄and Q mark for Google,manumurali369,1.0,0.0
2023-01-04 07:16:56+00:00,1610535734758219778,I used chat gpt to get gym workout program and it was so good. Sore af rn!,pnik91,0.0,0.0


date,id,content,username,like_count,retweet_count
2023-01-30 11:12:52+00:00,1620017194158682112,"Should Colleges,Schools and Universities ban using of #ChatGPT ?\n\n#OpenAI #OpenAIChatGPT",Opinionmint,0.0,0.0
2023-02-08 11:27:04+00:00,1623282260773597185,"#ChatGPT 3/6 - \nHow does it do this? By training on a massive #dataset of books, articles, and #websites through unsupervised learning. This enables ChatGPT to recognize patterns and generate text that is eerily similar to text written by humans.",TexalaIndia,1.0,0.0
2023-01-07 22:07:13+00:00,1611846946334859265,"@diane_levitt @jnxyz @NYCSchools that could be creatively fun, #chatgpt for homework, reflect on the experience in class, a flipped model ;)",chrisdavisLens,2.0,0.0
2023-03-27 18:17:28+00:00,1640417769303977984,My favorite #chatgpt magic trick.\nGarbage In &gt; Structured Out 🪄 https://t.co/ldzqSG2OK4,vyrotek,7.0,0.0
2023-03-14 15:12:58+00:00,1635660298232254467,@LickshotLippy Fredo saying 'u need to come here' to a girl is like backroad gee speaking a language we can understand...these some chat gpt ass dm's,Himmy_Crooks,1.0,0.0
2023-03-29 13:37:52+00:00,1641072181831426049,".@nabe_econ panel on the #debt w/@MayaMacGuineas and Doug Elmendorf has veered into a conversation about #ChatGPT, their recent experimental uses and how it impacts long-term policy decisions. \n@PIIE https://t.co/qQWARgpBXl",ConstanceHunter,2.0,0.0
2023-03-17 13:23:21+00:00,1636719876055216131,How You Can Benefit Using Chat GPT https://t.co/UPLA3XpJM1,876dennis,0.0,0.0
2023-02-04 13:26:39+00:00,1621862801743437825,Here’s a list of the countries #ChatGPT wouldn’t joke about \n@OpenAI why tho? \n#bias #Politics https://t.co/L7SjGCm7bB,DANChat_app,0.0,0.0
2023-02-02 14:07:27+00:00,1621148295384150016,"ChatGPT is switching to a freemium model, with a new paid tier, priced at $20 per month, which will give users better access to the tool.\n\n#AI #ChatGPT #freemium #web3 #ArtificialIntelligence \n\nhttps://t.co/YV4mcbWeAD",PonyWeb3,0.0,0.0
2023-01-22 17:59:31+00:00,1617220427881017345,I asked #ChatGPT to Show me day wise calculation if $100 is doubled every 10 days till 100 days and here it responded.\n\nDay 1: $100\nDay 11: $200\nDay 21: $400\nDay 31: $800\nDay 41: $1600\nDay 51: $3200\nDay 61: $6400\nDay 71: $12800\nDay 81: $25600\nDay 91: $51200\nDay 100: $102400\nWooh https://t.co/CV7eneSxRk,Crypto_sniperr,0.0,0.0
