### Step 1: Upload Files and Uncompress Archives

First, let's import the necessary modules and define a function for uncompressing files.

In [1]:
from google.colab import files
import os, tarfile, zipfile

def uncompress_file(file_name, destination):
    """Extract a tar or zip archive into ``destination``.

    The archive type is chosen from the file name suffix, compared
    case-insensitively: ``.tar``, ``.tar.gz`` and ``.tgz`` are opened with
    ``tarfile`` (compression auto-detected), ``.zip`` with ``zipfile``.

    Args:
        file_name: Path of the archive to extract.
        destination: Directory the members are extracted into. It is created
            if it does not exist yet.

    Returns:
        True if ``file_name`` was recognised as an archive and extracted,
        False if the suffix is not supported (nothing is touched then).

    Raises:
        tarfile.TarError / zipfile.BadZipFile: if the file is not a valid
            archive of the type its suffix announces.
    """
    lowered = file_name.lower()

    if lowered.endswith(('.tar.gz', '.tgz', '.tar')):
        os.makedirs(destination, exist_ok=True)
        with tarfile.open(file_name, 'r:*') as tar:
            # Uploaded archives are untrusted input. When the running Python
            # provides extraction filters, use the 'data' filter so members
            # cannot escape `destination` (absolute paths, '..', bad links).
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(path=destination, filter='data')
            else:
                tar.extractall(path=destination)
        return True

    if lowered.endswith('.zip'):
        os.makedirs(destination, exist_ok=True)
        with zipfile.ZipFile(file_name, 'r') as zip_ref:
            zip_ref.extractall(destination)
        return True

    return False

Now, upload the files and uncompress any archives.

In [2]:
# Every upload ends up under /content/uncompressed_files: archives are
# unpacked into it, any other file is moved there unchanged.
os.makedirs('/content/uncompressed_files', exist_ok=True)

# Opens the Colab upload widget; the result is keyed by uploaded file name.
uploaded = files.upload()

for filename in uploaded:
    if not filename.endswith(('.tar', '.tar.gz', '.zip')):
        # Plain file: relocate it next to the extracted archive contents.
        os.rename(filename, f'/content/uncompressed_files/{filename}')
        continue
    uncompress_file(filename, '/content/uncompressed_files')

### Step 2: Access Google Drive Directory and Google Sheets

Mount your Google Drive to access files from it.

In [3]:
from google.colab import drive
# Expose the Google Drive contents under /content/drive; later cells build
# their paths from /content/drive/MyDrive/.
drive.mount('/content/drive')

Mounted at /content/drive


Specify the directory in your Google Drive that you want to access.

In [4]:
# Folder inside "MyDrive" to read from. Change the value below (or the Colab
# form field) to point at your own folder. The value is appended verbatim to
# the MyDrive path, so it must not start with a slash.
sub_dir = "lingq_texts/pl/Daily Polish Story/"  #@param {type:"string"}
drive_directory = '/content/drive/MyDrive/' + sub_dir
# Echo the resolved path so it can be checked before it is used.
drive_directory

'/content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/'

In [5]:
from google.colab import auth
import gspread
from google.auth import default

# Authenticate the Colab user so default credentials become available.
auth.authenticate_user()

# default() yields a pair; only the credentials are needed here. The second
# item is bound to a named throwaway rather than `_`, because assigning `_`
# at notebook top level shadows IPython's "last output" variable.
creds, _unused_project = default()

# Authorised gspread client used later to create and fill the spreadsheet.
gc = gspread.authorize(creds)


In [6]:
import spacy

#@title Choose a language model
model = "pl_core_news_lg" #@param ["pl_core_news_lg", "ca_core_news_sm", "da_core_news_sm", "de_core_news_sm", "el_core_news_sm", "en_core_web_sm", "es_core_news_sm", "fi_core_news_sm", "fr_core_news_sm", "hr_core_news_sm", "it_core_news_sm", "ja_core_news_sm", "ko_core_news_sm", "lt_core_news_sm", "mk_core_news_sm", "nb_core_news_sm", "nl_core_news_sm", "pt_core_news_sm", "ro_core_news_sm", "sl_core_news_sm", "sv_core_news_sm", "ru_core_news_sm", "uk_core_news_sm", "xx_ent_wiki_sm", "xx_sent_ud_sm", "zh_core_web_sm"]
!python -m spacy download {model}

spacy.prefer_gpu()

nlp = spacy.load(model)

2023-12-22 19:34:53.702587: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-22 19:34:53.702648: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-22 19:34:53.703577: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting pl-core-news-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pl_core_news_lg-3.6.0/pl_core_news_lg-3.6.0-py3-none-any.whl (573.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m573.7/573.7 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pl-core-news-lg
Successfully ins

### Step 3: Iterate Through .txt Files and Collect spaCy Token Data

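Define a function that walks a directory, runs the spaCy pipeline over every `.txt` file it finds, and returns one row of token data (text, lemma, part of speech, sentence and morphological features) per token.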

In [7]:
# Morphological feature names that get one spreadsheet column each.
# See https://github.com/jamiepratt/lingq/blob/main/spacy_experiments.ipynb
# for how this list was calculated.
PL_morphology_features = [
    'Animacy', 'Case', 'Gender', 'Number',
    'AdpType', 'Variant', 'Aspect', 'Mood',
    'Person', 'Tense', 'VerbForm', 'Voice',
    'Degree', 'PronType', 'NumForm', 'PrepCase',
    'Polarity', 'Reflex', 'NumType', 'Hyph',
    'Poss', 'ConjType', 'VerbType', 'Clitic',
    'Foreign', 'Number[psor]', 'PartType', 'Abbr',
    'Pun', 'Emphatic', 'PunctSide', 'Polite',
]

# Header row: the fixed per-token columns followed by one column per feature.
data_cols = [
    "filename",
    "directory",
    "token_no",
    "term",
    "lemma",
    "pos",
    "part of speech",
    "sentence",
    "morph",
    *PL_morphology_features,
]

In [8]:
def spacy_data_from_txt_files_content(directory):
    """Run the spaCy pipeline over every ``.txt`` file under ``directory``.

    The directory is walked recursively. Each matching file is read, passed
    through the module-level ``nlp`` pipeline, and turned into one row per
    token.

    Args:
        directory: Root directory to search for ``.txt`` files.

    Returns:
        A list of rows (lists). Each row holds, in order: the file path, the
        directory the file was found in, the token index, token text, lemma,
        coarse POS tag, the ``spacy.explain`` text for that tag, the text of
        the containing sentence, the morphology rendered as a string, and
        then one value per entry of ``PL_morphology_features`` (the first
        value of that feature, or ``"NOT_SET"`` when the token lacks it).
        An empty list is returned when no ``.txt`` file is found.
    """
    data = []
    for root, _dirs, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            file_path = os.path.join(root, filename)
            # Decode explicitly as UTF-8 so non-ASCII text is read the same
            # way regardless of the runtime's locale default.
            # NOTE(review): assumes the input texts are UTF-8 encoded.
            with open(file_path, 'r', encoding='utf-8') as text_file:
                print(f"Processing contents of {file_path}:\n")
                doc = nlp(text_file.read())

            for t in doc:
                row = [
                    file_path,
                    root,
                    t.i,
                    t.text,
                    t.lemma_,
                    t.pos_,
                    spacy.explain(t.pos_),
                    t.sent.text,
                    f'{t.morph}',
                ]
                for feat in PL_morphology_features:
                    # morph.get() yields a (possibly empty) list of values.
                    feat_value = t.morph.get(feat)
                    row.append(feat_value[0] if feat_value else "NOT_SET")
                data.append(row)

    return data

In [9]:
# Token rows from the uploaded files come first, then those from the Drive
# folder; both are flattened into a single list of rows.
spacy_data = [
    token_row
    for source_dir in ('/content/uncompressed_files', drive_directory)
    for token_row in spacy_data_from_txt_files_content(source_dir)
]

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/001A-Marta próbowała usnąć, ale nie mogła..txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/001B-DailyPolishStory-POV.txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/001C-DailyPolishStory-QA.txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/002A-"Marek mył swoje ręce".txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/002B-DailyPolishStory-POV.txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/002C-DailyPolishStory-QA.txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/003A-"Robert mało je.".txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/003B-DailyPolishStory.txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/003C-Daily

In [10]:
# Show the first token row to sanity-check the column layout before export.
spacy_data[0]


['/content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/001A-Marta próbowała usnąć, ale nie mogła..txt',
 '/content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/',
 0,
 'Cześć',
 'cześć',
 'INTJ',
 'interjection',
 'Cześć!',
 '',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET',
 'NOT_SET']

In [11]:

# Create a new Google Sheet whose title includes the analysed Drive folder.
sh = gc.create(f'spaCy analysis of {drive_directory}')
# Open the first worksheet ('0' refers to the first sheet).
worksheet = sh.get_worksheet(0)

# Write the header row followed by one row per token, starting at cell A1.
# Keyword arguments are used because the positional order of the range and
# the values in Worksheet.update() differs between gspread major versions.
worksheet.update(range_name='A1', values=[data_cols] + spacy_data)

{'spreadsheetId': '1NkcnIjShjRj3nKhX_oaPuVTnnJXi2DLcEpKQ9Syiu4A',
 'updatedRange': 'Sheet1!A1:AO24372',
 'updatedRows': 24372,
 'updatedColumns': 41,
 'updatedCells': 999252}