# Load the packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from tqdm import tqdm
%matplotlib inline
sns.set(font_scale=1.5, style = 'whitegrid', color_codes=True)
import time
import os
import pickle
import concurrent.futures
from geopy.geocoders import Nominatim

In [2]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
cache_path = ('/content/drive/My Drive/Colab Notebooks/CHATGPT/geocode_cache.pickle')
file_path = ('/content/drive/My Drive/Colab Notebooks/CHATGPT/Twitter_EDA.csv')

In [5]:
df = pd.read_csv(file_path)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1255501 entries, 0 to 1255500
Data columns (total 9 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   UserId           1255501 non-null  float64
 1   UserName         1255501 non-null  object 
 2   Verified         1255501 non-null  bool   
 3   Location         896371 non-null   object 
 4   Followers        1255501 non-null  float64
 5   Tweet            1255501 non-null  object 
 6   Language         1255501 non-null  object 
 7   Date             1255501 non-null  object 
 8   Time_of_the_day  1255501 non-null  object 
dtypes: bool(1), float64(2), object(6)
memory usage: 77.8+ MB


In [7]:
df.shape

(1255501, 9)

# **Data Wrangling and EDA**

## **Get country information from the location column**

In [8]:
# fill the na in Location column with "Unknown"
df['Location'].fillna(value='Unknown',inplace=True)

# Assign unkown location's country to "Unknonwn"
df.loc[df['Location']=='Unknown','Country'] = 'Unknown'

In [9]:
df_country = df[df['Country']!='Unknown'].copy()   #get rows with valid location and needs to find the country

In [10]:
df_country.shape 

(896309, 10)

In [11]:
df_country.Country.value_counts()

Series([], Name: Country, dtype: int64)

The geocode progress can be a very slow process, showed an estimate of over 1hour for processing 10000 items. To improve the speed of process:


- cache the results
- batch process
- parallel processing

In [24]:
#create a geolocation object
locator = Nominatim(user_agent = 'myGeocoder22')

#load or create cache for geocode results
cache_path = ('/content/drive/My Drive/Colab Notebooks/CHATGPT/geocode_cache.pickle')
if os.path.exists(cache_path):
    with open(cache_path, 'rb') as f:
        geocode_cache = pickle.load(f)
else:
    geocode_cache = {}
    geocode_cache['Unknown'] = 'Unknown'

print(len(geocode_cache))

32588


In [8]:
# define a function to extract the country fromm the location column
def get_country(location):
    # check if geocode result is in cache
      if location in geocode_cache:
          return geocode_cache[location]
      else:
          try:
              loc = locator.geocode(location)
              if loc:
                  country = loc.raw['display_name'].split(',')[-1].strip()
                  # Add location to cache if not in cache
                  geocode_cache[location] = country
                  return country
              else:
                  return "Unknown"
          except:
              return "Unknown"

In [10]:
#define a function to process locations in batches using parallel processing
def process_locations(df, max_workers, batch_size=100):
    locations = df['Location'].to_list()
    n = len(locations)
    country_list = ['Unknown']*n
    
    #split the locations into batches (tuples, with start index)
    batches = [(i, locations[i:i+batch_size]) for i in range(0, n, batch_size)]
    
    #process each batch in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers = max_workers) as executor:
        future_to_batch = {executor.submit(process_batch, idx, batch):batch for idx, batch in batches}
        for future in tqdm(concurrent.futures.as_completed(future_to_batch), total=len(future_to_batch)):
            start_index, batch_result = future.result()
            for i, country in enumerate(batch_result):
                idx = i + start_index
                if idx <n:
                    country_list[idx] = country
    #store the geocode cache to file
    with open(cache_path, 'wb') as f:
        pickle.dump(geocode_cache, f)
    
    return country_list


In [11]:
#define a function to process a single batch of locations
def process_batch(start_idx, batch):
    batch_result = [get_country(location) for location in batch]
    return start_idx, batch_result

In [16]:
df_1 = df_country.loc[0:10000,:].copy()
df_1['Country'].value_counts(), len(geocode_cache)

(Series([], Name: Country, dtype: int64), 2354)

In [17]:
df_1['Country'] = process_locations(df_1, max_workers=10)

100%|██████████| 73/73 [03:57<00:00,  3.25s/it]


In [18]:
print(f"unknown rate = {(df_1['Country']=='Unknown').sum()/df_1.shape[0]*100:.2f}%")

unknown rate = 30.66%


In [19]:
df_country.loc[0:10000, 'Country'] = df_1['Country']

In [20]:
df_2 = df_country.loc[10000:20000,:].copy()
df_2['Country'].value_counts(), len(geocode_cache)

(Unknown    1
 Name: Country, dtype: int64, 2459)

In [21]:
df_2['Country'] = process_locations(df_2, max_workers=10)

100%|██████████| 72/72 [04:28<00:00,  3.73s/it]


In [22]:
unknown = (df_2['Country']=='Unknown').sum()
total = df_2.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 2485, Total: 7148, Unknown rate: 34.76%


In [23]:
df_country.loc[10000:20000, 'Country'] = df_2['Country']

In [24]:
df_3 = df_country.loc[20000:30000,:].copy()
df_3['Country'].value_counts(), len(geocode_cache)

(Unknown    1
 Name: Country, dtype: int64, 2481)

In [25]:
df_3['Country'] = process_locations(df_3, max_workers=10)

100%|██████████| 72/72 [05:03<00:00,  4.21s/it]


In [26]:
unknown = (df_3['Country']=='Unknown').sum()
total = df_3.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 2674, Total: 7187, Unknown rate: 37.21%


In [27]:
df_country.loc[20000:30000, 'Country'] = df_3['Country']

In [28]:
df_country.to_csv('/content/drive/My Drive/Colab Notebooks/CHATGPT/Twitter_EDA_country.csv')

In [29]:
df_4 = df_country.loc[30000:50000,:].copy()
df_4['Country'].value_counts(), len(geocode_cache)

(Series([], Name: Country, dtype: int64), 2702)

In [30]:
df_4['Country'] = process_locations(df_4, max_workers=10)

100%|██████████| 141/141 [11:03<00:00,  4.71s/it]


In [31]:
unknown = (df_4['Country']=='Unknown').sum()
total = df_4.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 6190, Total: 14048, Unknown rate: 44.06%


In [32]:
df_country.loc[30000:50000, 'Country'] = df_4['Country']

In [33]:
df_5 = df_country.loc[50000:80000,:].copy()
df_5['Country'].value_counts(), len(geocode_cache)

(Unknown    1
 Name: Country, dtype: int64, 3306)

In [34]:
df_5['Country'] = process_locations(df_5, max_workers=10)

100%|██████████| 210/210 [17:20<00:00,  4.95s/it]


In [35]:
unknown = (df_5['Country']=='Unknown').sum()
total = df_5.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 9848, Total: 20981, Unknown rate: 46.94%


In [36]:
df_country.loc[50000:80000, 'Country'] = df_5['Country']

In [37]:
df_6 = df_country.loc[80000:100000,:].copy()
df_6['Country'].value_counts(), len(geocode_cache)

(France    1
 Name: Country, dtype: int64, 4079)

In [38]:
df_6['Country'] = process_locations(df_6, max_workers=10)

100%|██████████| 137/137 [11:10<00:00,  4.89s/it]


In [39]:
unknown = (df_6['Country']=='Unknown').sum()
total = df_6.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 6477, Total: 13639, Unknown rate: 47.49%


In [40]:
df_country.loc[80000:100000, 'Country'] = df_6['Country']

In [41]:
df_7 = df_country.loc[100000:150000,:].copy()
df_7['Country'].value_counts(), len(geocode_cache)

(Schweiz/Suisse/Svizzera/Svizra    1
 Name: Country, dtype: int64, 4297)

In [42]:
df_7['Country'] = process_locations(df_7, max_workers=10)

100%|██████████| 351/351 [28:48<00:00,  4.92s/it]


In [43]:
unknown = (df_7['Country']=='Unknown').sum()
total = df_7.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 16768, Total: 35078, Unknown rate: 47.8%


In [44]:
df_country.loc[100000:150000, 'Country'] = df_7['Country']

In [53]:
df_country_2 = df_country[['UserName', 'Location', 'Country']].copy()

In [55]:
df_country_2.to_csv('/content/drive/My Drive/Colab Notebooks/CHATGPT/Twitter_EDA_country_sim.csv')

In [56]:
df_8 = df_country_2.loc[150000:200000,:].copy()
df_8['Country'].value_counts(), len(geocode_cache)

(Unknown    1
 Name: Country, dtype: int64, 5466)

In [57]:
df_8['Country'] = process_locations(df_8, max_workers=10)

100%|██████████| 358/358 [24:35<00:00,  4.12s/it]


In [58]:
unknown = (df_8['Country']=='Unknown').sum()
total = df_8.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 14225, Total: 35798, Unknown rate: 39.74%


In [59]:
df_country_2.loc[150000:200000, 'Country'] = df_8['Country']

In [60]:
df_9 = df_country_2.loc[200000:300000,:].copy()
df_9['Country'].value_counts(), len(geocode_cache)

(Series([], Name: Country, dtype: int64), 6612)

In [68]:
df_9['Country'] = process_locations(df_9, max_workers=10)

100%|██████████| 716/716 [44:09<00:00,  3.70s/it]


In [69]:
unknown = (df_9['Country']=='Unknown').sum()
total = df_9.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 25003, Total: 71522, Unknown rate: 34.96%


In [70]:
df_country_2.loc[200000:300000, 'Country'] = df_9['Country']

In [71]:
df_country_2.to_csv('/content/drive/My Drive/Colab Notebooks/CHATGPT/Twitter_EDA_country_sim.csv')

In [72]:
df_10 = df_country_2.loc[300000:400000,:].copy()
df_10['Country'].value_counts(), len(geocode_cache)

(Series([], Name: Country, dtype: int64), 9185)

In [73]:
df_10['Country'] = process_locations(df_10, max_workers=10)

100%|██████████| 710/710 [41:51<00:00,  3.54s/it]


In [74]:
unknown = (df_10['Country']=='Unknown').sum()
total = df_10.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 23785, Total: 70977, Unknown rate: 33.51%


In [75]:
df_country_2.loc[300000:400000, 'Country'] = df_10['Country']

In [93]:
df_country_2.to_csv('/content/drive/My Drive/Colab Notebooks/CHATGPT/Twitter_EDA_country_sim.csv')

In [77]:
df_temp = df_country_2.loc[400000:500000,:].copy()
df_temp['Country'].value_counts(), len(geocode_cache)

(Series([], Name: Country, dtype: int64), 11295)

In [79]:
df_temp['Country'] = process_locations(df_temp, max_workers=10)

100%|██████████| 718/718 [39:56<00:00,  3.34s/it]


In [80]:
unknown = (df_temp['Country']=='Unknown').sum()
total = df_temp.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 23526, Total: 71782, Unknown rate: 32.77%


In [81]:
df_country_2.loc[400000:500000, 'Country'] = df_temp['Country']

In [85]:
df_temp = df_country_2.loc[500000:600000,:].copy()
df_temp['Country'].value_counts(), len(geocode_cache)

(Unknown    1
 Name: Country, dtype: int64, 13310)

In [86]:
df_temp['Country'] = process_locations(df_temp, max_workers=10)

100%|██████████| 723/723 [37:10<00:00,  3.09s/it]


In [87]:
unknown = (df_temp['Country']=='Unknown').sum()
total = df_temp.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 21432, Total: 72223, Unknown rate: 29.67%


In [88]:
df_country_2.loc[500000:600000, 'Country'] = df_temp['Country']

In [89]:
df_temp = df_country_2.loc[600000:700000,:].copy()
df_temp['Country'].value_counts(), len(geocode_cache)

(Series([], Name: Country, dtype: int64), 15136)

In [90]:
df_temp['Country'] = process_locations(df_temp, max_workers=10)

100%|██████████| 698/698 [35:05<00:00,  3.02s/it]


In [91]:
unknown = (df_temp['Country']=='Unknown').sum()
total = df_temp.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 20307, Total: 69776, Unknown rate: 29.1%


In [92]:
df_country_2.loc[600000:700000, 'Country'] = df_temp['Country']

In [95]:
df_temp = df_country_2.loc[700000:800000,:].copy()
df_temp['Country'].value_counts(), len(geocode_cache)

(Argentina    1
 Name: Country, dtype: int64, 16825)

In [98]:
df_temp['Country'] = process_locations(df_temp, max_workers=10)

100%|██████████| 708/708 [30:56<00:00,  2.62s/it]


In [99]:
unknown = (df_temp['Country']=='Unknown').sum()
total = df_temp.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 18339, Total: 70710, Unknown rate: 25.94%


In [100]:
df_country_2.loc[700000:800000, 'Country'] = df_temp['Country']

In [104]:
df_temp = df_country_2.loc[800000:1000000,:].copy()
df_temp['Country'].value_counts(), len(geocode_cache)

(Series([], Name: Country, dtype: int64), 19505)

In [106]:
df_temp['Country'] = process_locations(df_temp, max_workers=10)

100%|██████████| 1426/1426 [1:07:33<00:00,  2.84s/it]


In [107]:
unknown = (df_temp['Country']=='Unknown').sum()
total = df_temp.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 39672, Total: 142582, Unknown rate: 27.82%


In [108]:
df_country_2.loc[800000:1000000, 'Country'] = df_temp['Country']

In [109]:
df_country_2.to_csv('/content/drive/My Drive/Colab Notebooks/CHATGPT/Twitter_EDA_country_sim.csv')

In [111]:
df_temp = df_country_2.loc[1000000:1300000,:].copy()
df_temp['Country'].value_counts(), len(geocode_cache)

(Series([], Name: Country, dtype: int64), 22299)

In [113]:
df_temp['Country'] = process_locations(df_temp, max_workers=10)

100%|██████████| 1857/1857 [1:29:31<00:00,  2.89s/it]


In [114]:
unknown = (df_temp['Country']=='Unknown').sum()
total = df_temp.shape[0]
unknown_rate = round(unknown/total*100,2)
print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")

Unknown: 53575, Total: 185628, Unknown rate: 28.86%


In [116]:
df_country_2.loc[1000000:1300000, 'Country'] = df_temp['Country']

In [118]:
df_country_2.to_csv('/content/drive/My Drive/Colab Notebooks/CHATGPT/Twitter_EDA_country_sim.csv')

In [124]:
df_country_3 = df_country_2[df_country_2['Country']=='Unknown'].copy()

In [12]:
def final_run(df):
  df_temp = df.copy()
  df_temp['Country'] = process_locations(df_temp, max_workers=10)
  unknown = (df_temp['Country']=='Unknown').sum()
  total = df_temp.shape[0]
  unknown_rate = round(unknown/total*100,2)
  print(f"Unknown: {unknown}, Total: {total}, Unknown rate: {unknown_rate}%")
  df['Country'] = df_temp['Country']
  df.to_csv('/content/drive/My Drive/Colab Notebooks/CHATGPT/Twitter_EDA_country_sim_2.csv')

In [20]:
df_country_2=pd.read_csv('/content/drive/My Drive/Colab Notebooks/CHATGPT/Twitter_EDA_country_sim.csv')
df_country_2 = df_country_2.set_index('Unnamed: 0')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [22]:
df_country_3 = df_country_2[df_country_2['Country']=='Unknown']
df_country_3

Unnamed: 0_level_0,UserName,Location,Country
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5,stoosepp,"Australia, for now",Unknown
6,kempes76,København,Unknown
7,AndrewScully1,"Avalon Beach, Sydney",Unknown
12,knelsonvsi,"California, Riverside",Unknown
19,dannypostmaa,Get a new profile picture →,Unknown
...,...,...,...
1255491,Coscorrodrift,prolly on youtube,Unknown
1255494,lingzhong_eth,⛓,Unknown
1255498,lingzhong_eth,⛓,Unknown
1255499,bitchrate,kha\vivian \\ it\its,Unknown


In [23]:
for i in range(10000, 300000, 10000):
  final_run(df_country_3[i-10000:i])
  print(f'{i-10000}-{i}done!')

100%|██████████| 100/100 [10:40<00:00,  6.41s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Country'] = df_temp['Country']


Unknown: 6087, Total: 10000, Unknown rate: 60.87%
0-10000done!


100%|██████████| 100/100 [11:39<00:00,  6.99s/it]


Unknown: 6429, Total: 10000, Unknown rate: 64.29%
10000-20000done!


100%|██████████| 100/100 [13:02<00:00,  7.82s/it]


Unknown: 7297, Total: 10000, Unknown rate: 72.97%
20000-30000done!


100%|██████████| 100/100 [14:09<00:00,  8.49s/it]


Unknown: 7876, Total: 10000, Unknown rate: 78.76%
30000-40000done!


100%|██████████| 100/100 [14:09<00:00,  8.49s/it]


Unknown: 8078, Total: 10000, Unknown rate: 80.78%
40000-50000done!


100%|██████████| 100/100 [13:43<00:00,  8.24s/it]


Unknown: 7646, Total: 10000, Unknown rate: 76.46%
50000-60000done!


100%|██████████| 100/100 [12:45<00:00,  7.65s/it]


Unknown: 7122, Total: 10000, Unknown rate: 71.22%
60000-70000done!


100%|██████████| 100/100 [12:24<00:00,  7.45s/it]


Unknown: 7174, Total: 10000, Unknown rate: 71.74%
70000-80000done!


100%|██████████| 100/100 [13:09<00:00,  7.89s/it]


Unknown: 7486, Total: 10000, Unknown rate: 74.86%
80000-90000done!


100%|██████████| 100/100 [13:00<00:00,  7.80s/it]


Unknown: 7203, Total: 10000, Unknown rate: 72.03%
90000-100000done!


100%|██████████| 100/100 [13:13<00:00,  7.94s/it]


Unknown: 7545, Total: 10000, Unknown rate: 75.45%
100000-110000done!


100%|██████████| 100/100 [13:47<00:00,  8.28s/it]


Unknown: 7763, Total: 10000, Unknown rate: 77.63%
110000-120000done!


100%|██████████| 100/100 [13:20<00:00,  8.00s/it]


Unknown: 7680, Total: 10000, Unknown rate: 76.8%
120000-130000done!


100%|██████████| 100/100 [13:30<00:00,  8.11s/it]


Unknown: 7797, Total: 10000, Unknown rate: 77.97%
130000-140000done!


100%|██████████| 100/100 [13:31<00:00,  8.12s/it]


Unknown: 7758, Total: 10000, Unknown rate: 77.58%
140000-150000done!


100%|██████████| 100/100 [14:13<00:00,  8.53s/it]


Unknown: 8082, Total: 10000, Unknown rate: 80.82%
150000-160000done!


100%|██████████| 100/100 [14:09<00:00,  8.50s/it]


Unknown: 8017, Total: 10000, Unknown rate: 80.17%
160000-170000done!


100%|██████████| 100/100 [14:36<00:00,  8.77s/it]


Unknown: 8425, Total: 10000, Unknown rate: 84.25%
170000-180000done!


100%|██████████| 100/100 [15:37<00:00,  9.37s/it]


Unknown: 8874, Total: 10000, Unknown rate: 88.74%
180000-190000done!


100%|██████████| 100/100 [15:19<00:00,  9.20s/it]


Unknown: 8750, Total: 10000, Unknown rate: 87.5%
190000-200000done!


100%|██████████| 100/100 [14:56<00:00,  8.97s/it]


Unknown: 8452, Total: 10000, Unknown rate: 84.52%
200000-210000done!


100%|██████████| 100/100 [14:52<00:00,  8.93s/it]


Unknown: 8539, Total: 10000, Unknown rate: 85.39%
210000-220000done!


100%|██████████| 100/100 [15:03<00:00,  9.03s/it]


Unknown: 8560, Total: 10000, Unknown rate: 85.6%
220000-230000done!


100%|██████████| 100/100 [04:20<00:00,  2.60s/it]


Unknown: 8932, Total: 10000, Unknown rate: 89.32%
230000-240000done!


100%|██████████| 100/100 [01:20<00:00,  1.25it/s]


Unknown: 9060, Total: 10000, Unknown rate: 90.6%
240000-250000done!


100%|██████████| 100/100 [01:09<00:00,  1.44it/s]


Unknown: 9210, Total: 10000, Unknown rate: 92.1%
250000-260000done!


100%|██████████| 100/100 [01:10<00:00,  1.42it/s]


Unknown: 9277, Total: 10000, Unknown rate: 92.77%
260000-270000done!


100%|██████████| 100/100 [01:11<00:00,  1.40it/s]


Unknown: 9422, Total: 10000, Unknown rate: 94.22%
270000-280000done!


100%|██████████| 66/66 [00:49<00:00,  1.35it/s]

Unknown: 6211, Total: 6519, Unknown rate: 95.28%
280000-290000done!





In [157]:
df.loc[df_country_2.index, 'Country'] = df_country_2['Country']

In [4]:
df=pd.read_csv('/content/drive/My Drive/Colab Notebooks/CHATGPT/Twitter_EDA_country.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
df = df.set_index('Unnamed: 0')

In [52]:
df_filter = df[(df['Location']!='Unknown')&(df['Country']=='Unknown')].copy()

In [53]:
df_filter['Country'] = df_filter['Location'].map(geocode_cache)

In [54]:
df_filter['Country'] = df_filter['Country'].fillna('Unknown')

In [66]:
df.loc[df_filter.index, 'Country'] = df_filter['Country']

In [68]:
df.to_csv('/content/drive/My Drive/Colab Notebooks/CHATGPT/Twitter_EDA_country.csv')

In [69]:
df

Unnamed: 0_level_0,UserId,UserName,Verified,Location,Followers,Tweet,Language,Date,Time_of_the_day,Country
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1620210123732828160.0,Erban_legend_,False,"Chicago, IL",763.0,ChatGPT is basically the best unpaid intern to...,en,2023-01-30,23:59:30,United States
1,1620210123456270336.0,doomoog,False,Unknown,60.0,@jeffersonmorley @ChatGPT @OpenAI It pulled th...,en,2023-01-30,23:59:30,Unknown
2,1620210110240034816.0,yanhaica1,False,"Toronto, on, Canada",1493.0,#ChatGPT is down for me since yesterday. Is it...,en,2023-01-30,23:59:26,Canada
3,1620210048281776128.0,yasegumi,False,Unknown,273.0,@GalorOded ChatGPT is literally the marriage o...,en,2023-01-30,23:59:12,Unknown
4,1620210039570022400.0,datos_digital,False,Australia,6385.0,ChatGPT Is Making Universities Rethink Plagiar...,en,2023-01-30,23:59:10,Australia
...,...,...,...,...,...,...,...,...,...,...
1255496,1598105066753650688.0,goodside,False,"Richmond, VA",40296.0,OpenAI’s new ChatGPT seems to be trained again...,en,2022-12-01,00:01:53,United States
1255497,1598104931248242688.0,1024pixels1,False,Unknown,268.0,では、Pixivでフォロワーを増やす為にはどうすれば良いのですか？\n#ChatGPT ht...,ja,2022-12-01,00:01:21,Unknown
1255498,1598104905029660672.0,lingzhong_eth,False,⛓,167.0,I asked #ChatGPT\n“Is COVID lockdown good or b...,en,2022-12-01,00:01:15,Unknown
1255499,1598104707548987392.0,bitchrate,False,kha\vivian \\ it\its,251.0,chatgpt is really cool but man saying shit to ...,en,2022-12-01,00:00:28,Unknown
