# Fetches all text content from your LingQ courses and saves it in a tree of directories on your Google Drive.

You need to provide your LingQ API key by storing it as a Colab secret named `LINGQ_API_KEY` (the notebook reads it from there). You can get the key from here:

https://www.lingq.com/en/accounts/apikey/


In [None]:
# @title Select a directory name for the base directory below which tree will be created
# @markdown

# Title of the top-level Google Drive folder; the language and course folders
# are created (or looked up) underneath it.
base_folder_name = 'lingq_texts'  # @param {type: "string"}
# @markdown ---


You can run this script to fetch only new content by pointing it at an existing tree. If a file already exists at the location in the tree for a particular lesson, that lesson's content is skipped; any new lessons are added to the tree.

To run the script select "Run all" from the "Runtime" menu above.

You can see a video of files being created in my drive from my lingq content: https://youtu.be/Tdr6uoF58RM

# Initial set up


In [None]:
#Using Google colab secrets to store keys.
from google.colab import userdata

import urllib.parse

import requests

import json

# Read the LingQ API token from the Colab secret store and build the HTTP
# headers used for LingQ API requests.
lingq_api_key = userdata.get('LINGQ_API_KEY')
headers = {
    'Authorization': 'Token {}'.format(lingq_api_key),
    'Content-Type': 'application/json',
}


# Helper Functions

In [None]:
def get_json_response(url):
    """Send an authenticated GET to *url* and return the decoded JSON body.

    The request asks for up to 1000 items per page via the ``page_size``
    query parameter and is sent with the module-level ``headers``.

    Args:
        url: Absolute URL of the endpoint to call. It may already contain a
            query string.

    Returns:
        The response body parsed as JSON.
    """
    # Let requests build the query string: concatenating '?page_size=1000'
    # by hand yields a malformed URL when `url` already carries a query.
    response = requests.get(url, params={'page_size': 1000}, headers=headers)
    return response.json()

def print_json(json_parsed):
    """Pretty-print *json_parsed* to stdout as JSON with a 4-space indent.

    Non-ASCII characters are written as-is instead of as Unicode escape
    sequences, so titles in non-Latin scripts remain readable.

    Args:
        json_parsed: Any JSON-serialisable object.
    """
    print(json.dumps(json_parsed, indent=4, ensure_ascii=False))


# Languages


In [None]:
# Fetch the language list from LingQ and keep the codes of the languages whose
# known-word count is non-zero.
languages = get_json_response('https://www.lingq.com/api/v2/languages/')
languages_with_known_words = []
for language in languages:
    if language["knownWords"] != 0:
        languages_with_known_words.append(language["code"])
print_json(languages_with_known_words)



# The meat of the sandwich

In [None]:
# Install PyDrive2
!pip install PyDrive2

# Import necessary libraries
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os
import urllib.parse

# Authenticate and create the PyDrive2 client
# Colab authenticates the user first; the application-default credentials are
# then attached to the GoogleAuth object so that `drive` is created from an
# authenticated session.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


_FOLDER_MIME_TYPE = 'application/vnd.google-apps.folder'


def _escape_drive_query_value(value):
    """Escape *value* for use inside a single-quoted Drive query literal.

    Backslashes and single quotes are backslash-escaped so that a title such
    as ``Don't panic`` cannot terminate the literal early.
    """
    return value.replace('\\', '\\\\').replace("'", "\\'")


def _find_or_create_folder(drive, title, parent_id=None):
    """Return the non-trashed Drive folder called *title*, creating it if needed.

    Args:
        drive: The PyDrive2 ``GoogleDrive`` client to use.
        title: Exact folder title to look up or create.
        parent_id: ID of the parent folder. When ``None`` the lookup is not
            restricted to a parent and a new folder gets no explicit parent.

    Returns:
        The first matching folder, or the newly uploaded one.
    """
    # The title is matched verbatim (only escaped for the query syntax) so the
    # lookup uses the same string the folder was created with.
    query = (
        f"title='{_escape_drive_query_value(title)}'"
        f" and mimeType='{_FOLDER_MIME_TYPE}' and trashed=false"
    )
    metadata = {'title': title, 'mimeType': _FOLDER_MIME_TYPE}
    if parent_id is not None:
        query += f" and '{parent_id}' in parents"
        metadata['parents'] = [{'id': parent_id}]

    matches = drive.ListFile({'q': query}).GetList()
    if matches:
        return matches[0]

    folder = drive.CreateFile(metadata)
    folder.Upload()
    return folder


# Create or find the base folder
base_folder = _find_or_create_folder(drive, base_folder_name)

# Iterate through languages_with_known_words
for language_code in languages_with_known_words:
    print(f'Fetching language {language_code}')
    courses_in_language = get_json_response(f'https://www.lingq.com/api/v2/{language_code}/collections/my/')

    # Create or find the language folder within the base folder
    language_folder_name = language_code
    language_folder = _find_or_create_folder(drive, language_folder_name, base_folder['id'])

    print_json(courses_in_language)

    for course in courses_in_language["results"]:
        course_title = course["title"]
        course_id = course["id"]
        print(f'Fetching course {course_title} - {course_id}')

        # The course is addressed by its numeric id, so the title never
        # appears in the URL and needs no encoding.
        lessons_in_course = get_json_response(f'https://www.lingq.com/api/v2/{language_code}/collections/{course_id}')

        # Create or find the course folder within the language folder
        course_folder_name = course_title
        course_folder = _find_or_create_folder(drive, course_folder_name, language_folder['id'])

        print_json(lessons_in_course)

        for lesson in lessons_in_course["lessons"]:
            lesson_title = lesson["title"]
            lesson_url = lesson["url"]
            lesson_file_name = f'{lesson_title}.txt'

            # Check for an existing file first so lessons that were already
            # saved do not cost another LingQ API request.
            existing_files = drive.ListFile({
                'q': (
                    f"title='{_escape_drive_query_value(lesson_file_name)}'"
                    f" and '{course_folder['id']}' in parents and trashed=false"
                )
            }).GetList()
            if existing_files:
                print(f'File {lesson_title}.txt already exists in {course_folder_name}')
                continue

            lesson_json = get_json_response(lesson_url)
            if "tokenizedText" not in lesson_json:
                continue

            # Keep the text of the first item of every tokenizedText entry,
            # skipping entries whose first token carries an "opentag" key.
            # NOTE(review): only index 0 of each entry is read - confirm that
            # entries never hold more than one sentence.
            sentences = [sentence[0]["text"] for sentence in lesson_json["tokenizedText"] if not ("opentag" in sentence[0]["tokens"][0])]
            text = " ".join(sentences)

            # Create and write the text content to a text file
            lesson_file = drive.CreateFile({'title': lesson_file_name, 'parents': [{'id': course_folder['id']}]})
            lesson_file.SetContentString(text)  # Write text content to the file
            lesson_file.Upload()  # Upload the file