# Casetext Solutions Engineer Coding Challenge



## Steps 
1. Load Packages
2. Download Data
3. Create WeSearch Collection
4. Upload Random Sample to Collection

## 1. Load Packages

In [1]:
# load packages
import requests # to create collection and upload to collection
import zipfile # to retrieve files from zip folder
import random # for randomly selecting files
import config # config.py contains authorozation token

In [6]:
# mount to Google Drive

from google.colab import drive
drive.mount('/content/gdrive', force_remount= True)
# %cd gdrive/MyDrive/Colab\ Notebooks/

Mounted at /content/gdrive


In [8]:
%cd gdrive/MyDrive/Colab\ Notebooks/

/content/gdrive/MyDrive/Colab Notebooks


In [92]:
# Initiate key variables
# Bearer token for the API, read from the local config module instead of being written in the notebook
auth_token = config.auth_token
# URL of the collection: documents are POSTed here, and the create endpoint is this URL plus '/create'
collection = 'https://project-apollo-api.stg.gc.casetext.com/v0/kayla-collects'
# Zip archive holding the documents; a relative path, so it is resolved against the current working directory
zip_file = '446c8aa0-6eba-11e5-bc7f-4851b79b387c.zip'

## 2. Download Data 

One thing that is important to note is that the files are in a zipped folder. I will be using the **zipfile** package to retrieve the files.

### Retrieve file names

In [11]:
# Create function to retrieve all file names within zip file

file_name_list = [] # names of every entry in the zip archive (filled by get_file_names)

def get_file_names(zip_path=None):
  """Fill ``file_name_list`` with the name of every entry in a zip archive.

  The list is replaced in place on each call, so re-running the notebook
  cell does not append a second copy of every name. Nothing is returned,
  which keeps a bare call from echoing the whole list as cell output.

  Args:
    zip_path: Path of the archive to inspect. Defaults to the notebook-level
      ``zip_file`` variable.
  """
  if zip_path is None:
    zip_path = zip_file
  # Bound to `archive` rather than `zip` so the builtin zip() stays usable
  with zipfile.ZipFile(zip_path, 'r') as archive:
    # namelist() returns the names of all archive members as a list
    file_name_list[:] = archive.namelist()

In [12]:
# Populate file_name_list with the name of every entry in the zip archive

get_file_names()

In [None]:
# Report the container type of file_name_list
print(f"file_name_list is a {type(file_name_list)}")

# Report how many entries the zip archive holds
print(f"There are {len(file_name_list)} files in the zip folder")

file_name_list is a <class 'list'>
There are 266912 files in the zip folder


### Randomly select files




In [13]:
# Create function to randomly select file names from file_name_list

random_file_list = [] # randomly sampled file names (filled by get_random_files)

def get_random_files(x= 1000, names=None):
  """Fill ``random_file_list`` with ``x`` names sampled without replacement.

  The list is replaced in place on each call, so re-running the notebook
  cell yields exactly ``x`` names instead of appending a second sample
  (which could contain names already drawn). Nothing is returned.

  Args:
    x: Number of names to sample; 1000 by default.
    names: Sequence of names to sample from. Defaults to the notebook-level
      ``file_name_list``.

  Raises:
    ValueError: If ``x`` is negative or larger than the number of names.
  """
  if names is None:
    names = file_name_list
  # random.sample() never picks the same position twice
  random_file_list[:] = random.sample(names, x)

In [14]:
# Draw the random sample (default size 1000) into random_file_list

get_random_files()

In [15]:
# Report how many file names were sampled

print(f"There are {len(random_file_list)} randomly chosen files")

There are 1000 randomly chosen files


## 3. Create WeSearch Collection

Now I will use the Python `requests` library to create a WeSearch collection that uses the lawbert model.

In [None]:
# Create function to create a WeSearch collection

def create_collection(model='lawbert'):
  """Send the request that creates the WeSearch collection.

  Args:
    model: Model name sent in the request body; 'lawbert' by default.

  Returns:
    The ``requests.Response`` of the create request, so the caller can
    inspect the status code and body.
  """
  headers = {
    'Authorization': 'Bearer ' + auth_token, # authorization token needed
    'Content-Type': 'application/json',
  }
  # The create endpoint lives under the collection URL, so derive it from
  # `collection` rather than repeating the address.
  create_url = collection + '/create'
  # `json=` serialises the dict and attaches it as the request body, so the
  # model selection is actually sent to the server.
  return requests.post(create_url, headers=headers, json={'model': model})

In [None]:
# Send the create request for the WeSearch collection
create_collection()

In [None]:
# Checking data: print the raw bytes of one known document in the archive
# The handle is named `archive`, not `zip`: at notebook level a `zip` binding
# would shadow the builtin zip() for every later cell.
with zipfile.ZipFile(zip_file, 'r') as archive:
  with archive.open('446c8aa0-6eba-11e5-bc7f-4851b79b387c/-us-judgment-us-2013-02-19-12-8091-main') as member:
    print(member.read())

b'\n      \n    U.S.\n    U.S. Supreme Court\n  \n12-8091\n\n\n02-19-2013\n\nIN RE WILLIAM STAPLES\n\n      \n        \n        HABEAS CORPUS DENIED\n        The petition for a writ of habeas corpus is denied. Justice Kagan took no part in the consideration or decision of this petition.\n\n      \n    '


## 4. Upload Random Sample to Collection

Now that the collection has been created we can upload the 1000 documents. Then, we can try some searches on the collection.

In [63]:
# Create function to retrieve text from files of interest
files = [] # raw bytes of each sampled document (filled by get_text)

def get_text(names=None, zip_path=None):
  """Fill ``files`` with the raw bytes of each selected archive member.

  The list is replaced in place on each call, so re-running the notebook
  cell does not queue every document for upload a second time. Nothing is
  returned, which keeps a bare call from echoing every document as cell
  output.

  Args:
    names: Names of the archive members to read, in the order they should
      appear in ``files``. Defaults to the notebook-level
      ``random_file_list``.
    zip_path: Path of the archive to read from. Defaults to the
      notebook-level ``zip_file`` variable.

  Raises:
    KeyError: If a name is not present in the archive.
  """
  if names is None:
    names = random_file_list
  if zip_path is None:
    zip_path = zip_file
  # Bound to `archive` rather than `zip` so the builtin zip() stays usable
  with zipfile.ZipFile(zip_path, 'r') as archive:
    # ZipFile.read() returns the bytes of one member
    files[:] = [archive.read(name) for name in names]

In [None]:
# Read the contents of every sampled file into `files`
get_text()

In [64]:
# Upload files to Collection
def upload_files():
  """POST every document in ``files`` to the collection, one request each.

  Returns:
    The ``requests.Response`` of the last upload, or ``None`` when ``files``
    is empty. The same value is stored in the notebook-level ``response``
    variable, which the verification cells after the upload read.
  """
  # The cells that follow inspect `response` directly, so it has to live at
  # notebook level instead of disappearing with the function's local scope.
  global response
  headers = {
    'Authorization': 'Bearer ' + auth_token,
    'Content-Type': 'text/plain',
    }
  response = None
  # One session for the whole batch lets the underlying connection be reused
  # instead of being re-established for every document.
  with requests.Session() as session:
    for data in files:
      response = session.post(collection, headers=headers, data=data)
  return response

In [None]:
# POST each document in `files` to the collection, one request per document
upload_files()

In [65]:
response

<Response [201]>

In [80]:
# Check which Python type the JSON response body decodes to
type(response.json())

dict

In [90]:
# Verify successful ingestion of 1000 files into the collection (len)
print(f"There are {len(response.json()['documents'])} documents in the Collection")

There are 1000 documents in the Collection
