In [1]:
# Import some libraries
import numpy as np  # useful for many scientific computing in Python
import pandas as pd # primary data structure library

In [2]:
# Function to clean up the chat string
def cleanup(input):
  """Return `input` with the non-breaking space normalized and direction marks removed.

  Each character is handled independently:
    * U+00A0 (non-breaking space) becomes a regular space.
    * U+202A, U+202C and U+200E (invisible text-direction marks) are dropped.
    * Every other character is kept unchanged.
  """
  # Lookup table: character ==> what to emit in its place ("" means drop it)
  substitutions = {
    "\xa0": " ",
    "\u202a": "",
    "\u202c": "",
    "\u200e": "",
  }
  # Characters missing from the table map to themselves
  return "".join(substitutions.get(ch, ch) for ch in input)

In [3]:
# Load the chat log and collect the phone number from every "joined" line
filename='chat.txt'

# We use a set here because we only want unique phone numbers. (That's what a set does.)
phones = set()

# Now open the file and read it.
# The log contains non-ASCII characters (see cleanup), so decode it explicitly
# instead of relying on the platform default encoding.
# NOTE(review): assumes the export is UTF-8 encoded -- confirm for your chat.txt.
with open(filename, 'r', encoding='utf-8') as file:       # Open the file in read-only mode
  for line in file:                                       # Loop through the file line by line
    if "joined" in line:                                  # If the line contains the string, use it
      split = line.split(":", 4)                          # Split at most 4 times on ":"
      phone_no = split[-1].split("joined")[0].strip()     # Keep the text of the last piece that precedes "joined"
      final = cleanup(phone_no).strip()                   # Clean up, then strip whitespace exposed by removing marks
      if final.startswith('+'):                           # startswith is safe on an empty string (final[0] is not)
        phones.add(final)                                 # Begins with "+": assume it's a phone number and save it

In [4]:
# Output the number of unique phone numbers collected in the set
len(phones)

116

In [5]:
# Build a one-column DataFrame of the unique phone numbers, sorted by value.
# Naming the column at construction (instead of assigning df.columns afterwards)
# also works when no phone numbers were found, and the chained calls avoid
# mutating the frame in place.
df = (
  pd.DataFrame({'Code': list(phones)})    # Convert the phone set ==> list ==> "Code" column
  .sort_values('Code')                    # Sort the phone numbers
  .reset_index(drop=True)                 # Renumber the rows from 0 after sorting
)
df.head()                                 # Display partial results

Unnamed: 0,Code
0,+1 (201) 247‑1187
1,+1 (201) 725‑8303
2,+1 (203) 727‑2789
3,+1 (206) 422‑3828
4,+1 (214) 326‑7221


In [6]:
# Function to extract the country code or US/Canada area code
def getCode(phone):
  """Return the area code or the country code found in `phone`.

  If `phone` contains "(", the text between the first "(" and the first ")"
  is returned (e.g. "201" for "+1 (201) ...").
  Otherwise everything before the first space is returned, including the
  leading "+" (e.g. "+223" for "+223 71 ...").

  Raises ValueError when the required ")" or space is missing.
  """
  if "(" not in phone:
    # No parentheses: the code is whatever precedes the first space
    return phone[:phone.index(" ")]
  start = phone.index("(") + 1    # First character after "("
  end = phone.index(")")          # Position of the closing ")"
  return phone[start:end]

In [7]:
# Try getCode on one country-code number and one number with a parenthesized area code
template = "The code for '{}' is '{}'"
for code in ('+223 71 41 19 05', '+1 (201) 247‑1187'):
  print(template.format(code, getCode(code)))


The code for '+223 71 41 19 05' is '+223'
The code for '+1 (201) 247‑1187' is '201'


In [8]:
# Replace each phone number in "Code" with its extracted country/area code.
# Series.map calls getCode once per value, so no row-wise DataFrame.apply is
# needed, and it is well defined for an empty column too.
# NOTE: this overwrites the phone numbers. Re-running only this cell fails,
# because getCode raises ValueError on a value without "(" or a space; rebuild df first.
df['Code'] = df['Code'].map(getCode)

In [9]:
# Sanity check: show the dtype of each DataFrame column
df.dtypes

Code    object
dtype: object

In [10]:
# Sanity check: show the DataFrame dimensions as (rows, columns)
df.shape

(116, 1)

In [11]:
# Count how many times each code occurs in the "Code" column
df['Code'].value_counts()

+52     13
703      6
+971     5
+57      5
+966     3
        ..
770      1
781      1
203      1
858      1
+974     1
Name: Code, Length: 74, dtype: int64

In [12]:
# Save the data to a CSV file, leaving out the DataFrame index
df.to_csv("chat_area_codes_from_members.csv", index=False)