### Access Google Drive Directory and Google Sheets

Mount your Google Drive to access files from it.

In [8]:
# Mount Google Drive at /content/drive so later cells can read files from it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Specify the directory in your Google Drive that you want to access.

In [9]:
# Sub-directory of "MyDrive" holding the .txt files to analyse; change it to your own path.
sub_dir = "lingq_texts/pl/Daily Polish Story/"  #@param {type:"string"}
# Plain string concatenation: sub_dir is appended verbatim, so it should not start with "/".
drive_directory = '/content/drive/MyDrive/' + sub_dir
# Bare last expression so the cell displays the resolved path.
drive_directory

'/content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/'

In [10]:
# Authenticate this Colab session so Google APIs can be called on the user's behalf.
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
# default() returns a (credentials, project) pair; only the credentials are used here.
creds, _ = default()

# gspread client used by a later cell to create and fill the spreadsheet.
gc = gspread.authorize(creds)


In [11]:
import spacy

#@title Choose a language model
model = "pl_core_news_lg" #@param ["pl_core_news_lg", "ca_core_news_sm", "da_core_news_sm", "de_core_news_sm", "el_core_news_sm", "en_core_web_sm", "es_core_news_sm", "fi_core_news_sm", "fr_core_news_sm", "hr_core_news_sm", "it_core_news_sm", "ja_core_news_sm", "ko_core_news_sm", "lt_core_news_sm", "mk_core_news_sm", "nb_core_news_sm", "nl_core_news_sm", "pt_core_news_sm", "ro_core_news_sm", "sl_core_news_sm", "sv_core_news_sm", "ru_core_news_sm", "uk_core_news_sm", "xx_ent_wiki_sm", "xx_sent_ud_sm", "zh_core_web_sm"]
!python -m spacy download {model}

spacy.prefer_gpu()

nlp = spacy.load(model)

2023-12-28 11:12:51.888142: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-28 11:12:51.888207: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-28 11:12:51.889798: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Collecting pl-core-news-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pl_core_news_lg-3.6.0/pl_core_news_lg-3.6.0-py3-none-any.whl (573.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m573.7/573.7 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now

### Step 3: Iterate Through `.txt` Files and Extract spaCy Token Data

Define a function that walks a directory, runs the spaCy pipeline on every `.txt` file it finds, and collects one row of data (term, lemma, part of speech, sentence, morphology) per token.

In [17]:
# see https://github.com/jamiepratt/lingq/blob/main/spacy_experiments.ipynb for how I calculated this:
# Header row for the exported sheet: one label per field of a token row.
data_cols = [
    "filename",
    "directory",
    "token_no",
    "term",
    "lemma",
    "pos",
    "part of speech",
    "sentence",
    "morph",
    "tags",
]

# Coarse part-of-speech tags whose tokens are left out of the export.
pos_to_skip = "PUNCT CCONJ SPACE X INTJ SYM".split()

In [20]:
import os

def spacy_data_from_txt_files_content(directory):
    """Run spaCy over every ``.txt`` file under ``directory`` and collect token rows.

    The tree is walked recursively, visiting sub-directories and files in
    sorted order so that the row order is reproducible between runs. Tokens
    whose coarse part-of-speech tag is listed in ``pos_to_skip`` are dropped.

    Relies on the module-level names ``nlp`` (loaded spaCy pipeline),
    ``pos_to_skip`` and ``spacy``.

    Args:
        directory: Root folder to search for ``.txt`` files.

    Returns:
        A list of rows, one per kept token. Each row is a list of
        ``[file name, containing directory, token index, token text, lemma,
        POS tag, POS explanation, sentence text, morphology string, tags]``
        where ``tags`` is the POS explanation followed by the individual
        morphological features, space-separated.
    """
    data = []
    for root, dirs, files in os.walk(directory):
        # Sorting `dirs` in place controls the order in which os.walk descends.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith('.txt'):
                continue
            file_path = os.path.join(root, file_name)
            print(f"Processing contents of {file_path}:\n")
            # Read with an explicit encoding and close the handle before the
            # (potentially slow) NLP pass instead of keeping it open throughout.
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
            doc = nlp(text)
            for t in doc:
                if t.pos_ in pos_to_skip:
                    continue
                # spacy.explain returns None for tags it does not know; fall
                # back to the raw tag so the join below cannot fail.
                explained = spacy.explain(t.pos_) or t.pos_
                morph = f'{t.morph}'
                # An empty morphology would otherwise split into [''] and
                # leave a trailing space in the tags column.
                features = [feature for feature in morph.split("|") if feature]
                data.append([
                    file_name, root, t.i, t.text, t.lemma_,
                    t.pos_, explained, t.sent.text, morph,
                    " ".join([explained] + features),
                ])

    return data

In [21]:
# Analyse every .txt file under the chosen Drive directory; one progress line is printed per file.
spacy_data = spacy_data_from_txt_files_content(drive_directory)

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/001A-Marta próbowała usnąć, ale nie mogła..txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/001B-DailyPolishStory-POV.txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/001C-DailyPolishStory-QA.txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/002A-"Marek mył swoje ręce".txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/002B-DailyPolishStory-POV.txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/002C-DailyPolishStory-QA.txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/003A-"Robert mało je.".txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/003B-DailyPolishStory.txt:

Processing contents of /content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/003C-Daily

In [22]:
# Peek at the first token row as a sanity check.
# NOTE(review): the saved output shows a file-object repr in the first column, whereas the
# function stores the plain file name there — the output looks stale; re-run to refresh it.
spacy_data[0]


["<_io.TextIOWrapper name='/content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/001A-Marta próbowała usnąć, ale nie mogła..txt' mode='r' encoding='UTF-8'>",
 '/content/drive/MyDrive/lingq_texts/pl/Daily Polish Story/',
 2,
 'Witam',
 'witać',
 'VERB',
 'verb',
 'Witam i zapraszam na pierwszą historyjkę pod tytułem Marta nie może spać.',
 'Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act',
 'verb Aspect=Imp Mood=Ind Number=Sing Person=1 Tense=Pres VerbForm=Fin Voice=Act']

In [23]:

# Create a new Google Sheet named after the analysed directory.
sh = gc.create(f'spaCy analysis of {drive_directory}')
# Grab the first worksheet of the new spreadsheet ('0' is its index).
worksheet = sh.get_worksheet(0)

# Write the header row followed by one row per token, starting at cell A1.
# Keyword arguments are used because gspread 6 swapped the positional order of
# `range_name` and `values`; naming them works with both old and new releases.
worksheet.update(range_name='A1', values=[data_cols] + spacy_data)

{'spreadsheetId': '1p90L7lDxJikkzJ2JLeZiLvCdXwro_cZdHzRKJmNkfvM',
 'updatedRange': 'Sheet1!A1:J18783',
 'updatedRows': 18783,
 'updatedColumns': 10,
 'updatedCells': 187830}