# Text Extraction

## Imports

In [3]:
import re
import json
import pandas as pd

## Data Extraction

### All Data

In [4]:
items = []

# open file
with open("data_cleaned.txt") as f:
    lines = [line.rstrip() for line in f] # read all files and remove whitespace
    for line in lines:
        if re.search("Request|Response", line): # Each request/response makes a new dict
            items.append({})
        elif len(line) == 0: 
            pass # skip if there is no content in the line
        else:
            k, v = re.split(":\s", line) # add the key value pair to the list
            items[-1][k.strip()] = v.strip() # Add to dictionary and remove surrounding whitespace

### Extract Discipline

In [5]:
for i in items:
    i['Discipline'] = re.search("edge\/(.*)$", i['Endpoint']).group(1)

### Match Request and Responses

In [6]:
users = []
found = set()
for i in range(len(items)):
    i1 = items[i]
    if (i1['Controller'], i1['Discipline']) in found:
        continue
    for j in range(i+1, len(items)):
        i2 = items[j]
        if i1['Controller'] == i2['Controller'] and i1['Discipline'] == i2['Discipline']:
            users.append({})
            for k, v in i1.items():
                users[-1][k] = v
            for k, v in i2.items():
                users[-1][k] = v
            found.add((i1['Controller'], i1['Discipline']))
            break

### Value Checks

In [7]:
print(
    len(users), users[0]
)

15 {'Address': '178373', 'Red Meat': '100 pounds', 'Controller': 'LocationController', 'Grains': '2 pounds', 'Action': 'Get', 'Endpoint': '(OWL) https://uwo.ca/edge/software', 'Dairy': '200 pounds', 'Cellphone': '55 hours', 'TV': '43 hours', 'Computer': '130 hours', 'New Year Resolution': '5%', 'Discipline': 'software', 'Username': 'Sammy', 'Car': '150 hours', 'Walking': '3 hours', 'Public Transport': '10 hours', 'Status': '4', 'Items/Total': '2/1'}


### Add Conversion Metrics

In [8]:
conversions = {}
with open('conversion_table.json') as f:
    conversions = json.loads(f.read())
conversions

{'public_transport': 4.3,
 'car': 6.5,
 'walking': 0,
 'red_meat': 8.0,
 'dairy': 6.3,
 'grains': 3.7,
 'phone': 3.6,
 'computer': 4.2,
 'tv': 6.8}

In [9]:
for k, v in conversions.items():
    for u in users:
        for k in ['Red Meat', 'Grains', 'Dairy', 'Cellphone', 'TV', 'Computer', 'Car', 'Walking', 'Public Transport']:
            units = int(re.search("^(.*)\s", u[k]).group(1))
            col_name = f"{k} CO2 Emissions"
            u[col_name] = units

## Extract CSV

### JSON To CSV Conversion

In [10]:
df = pd.DataFrame(users)

### Rename Columns

In [11]:
for col in df.columns:
    new_col = col.lower().repalce(" ", "_")
    print(new_col)

AttributeError: 'str' object has no attribute 'repalce'