In [9]:
# Install required packages
!pip install gensim numpy nltk

# Clone the project repository containing the NFCorpus data and other necessary files for the project.
!git clone https://github.com/cr-nlp/project1-2023.git

# Import necessary libraries
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import os
from os.path import isfile, join

# nltk downloads - run these commands if the resources haven't been downloaded yet
# nltk.download('stopwords')
# nltk.download('punkt')
    
# Define a function to load the NFCorpus data from the cloned GitHub repository.
def loadNFCorpus():
    # Define the directory where the data is located.
    dir = "./project1-2023/"
    # Load the document data which contains abstracts from PubMed.
    filename = dir + "dev.docs"
    
    # Initialize a dictionary to store document data.
    dicDoc = {}
    # Read document lines and split them into a dictionary with key as document ID and value as text.
    with open(filename,encoding='utf-8') as file:
        lines = file.readlines()
    for line in lines:
        tabLine = line.split('\t')
        key = tabLine[0]
        value = tabLine[1]
        dicDoc[key] = value
    
    # Load and parse the query data similar to document data.
    filename = dir + "dev.all.queries"
    dicReq = {}
    with open(filename, encoding='utf-8') as file:
        lines = file.readlines()
    for line in lines:
        tabLine = line.split('\t')
        key = tabLine[0]
        value = tabLine[1]
        dicReq[key] = value
    
    # Load the relevance judgments which provide a relevance score for document-query pairs.
    filename = dir + "dev.2-1-0.qrel"
    dicReqDoc = defaultdict(dict)
    with open(filename) as file:
        lines = file.readlines()
    for line in lines:
        tabLine = line.strip().split('\t')
        req = tabLine[0]
        doc = tabLine[2]
        score = int(tabLine[3])
        dicReqDoc[req][doc] = score
    
    # Return the loaded document and query data along with relevance judgments.
    return dicDoc, dicReq, dicReqDoc

# Load and process NFCorpus data
documents, queries, relevance = loadNFCorpus()

# Define stopwords outside the function to avoid repeated loading
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize and remove stopwords
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2 and word.isalpha()]
    return tokens

def text_to_embedding(text, model):
    # Preprocess and tokenize text
    tokens = preprocess_text(text)  # Make sure you have a preprocess_text function defined
    # Filter tokens based on the model's vocabulary and get embeddings
    embeddings = [model.wv[word] for word in tokens if word in model.wv.key_to_index]
    
    # If at least one token produced an embedding, return the mean vector
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.wv.vector_size)

# Process documents to create 'processed_docs' for Word2Vec training
processed_docs = [preprocess_text(doc) for doc in documents.values()]

# Create the Word2Vec model
model = Word2Vec(processed_docs, vector_size=100, window=5, min_count=1, workers=4)

# Convert a sample query to embedding
sample_query = "What are the latest treatments for diabetes mellitus?"
query_embedding = text_to_embedding(sample_query, model)

# Compute document embeddings and similarities (for demonstration purposes, this is not scalable for large datasets)
document_embeddings = {doc_id: text_to_embedding(doc_content, model) for doc_id, doc_content in documents.items()}
query_doc_similarity = {doc_id: cosine_similarity([query_embedding], [doc_embedding])[0][0] 
                        for doc_id, doc_embedding in document_embeddings.items() if np.any(doc_embedding)}

# Sort documents based on similarity to the sample query
sorted_doc_similarity = sorted(query_doc_similarity.items(), key=lambda item: item[1], reverse=True)

# Print out top 5 relevant documents
print("Top 5 relevant documents for the query:")
for doc_id, similarity in sorted_doc_similarity[:5]:
    print(f"Doc ID: {doc_id}, Similarity: {similarity:.4f}")
    print(f"Title: {documents[doc_id].split('.')[0]}")
    print(f"Content Snippet: {' '.join(documents[doc_id].split(' ')[:50])}...")
    print("")



fatal: destination path 'project1-2023' already exists and is not an empty directory.


Top 5 relevant documents for the query:
Doc ID: MED-1993, Similarity: 0.9641
Title: type num diabetes mellitus children adolescents abstract type num diabetes mellitus emerging clinical problem pediatric practice recent reports increasing prevalence type num diabetes mellitus children adolescents world ethnicities prevalence obesity increasing majority young people diagnosed type num diabetes mellitus found specific ethnic subgroups african-american hispanic asian/pacific islanders american indians clinicians aware frequent mild asymptomatic manifestation type num diabetes mellitus childhood screening meaningful high risk groups children adolescents obesity relatives type num diabetes mellitus clinical features insulin resistance hypertension dyslipidemia polycystic ovarian syndrome acanthosis nigricans treatment choice lifestyle intervention pharmacological treatment e g metformin drugs dipeptidyl peptidase inhibitors glucagon peptide num mimetics pipeline treatment youth type num dia

In [21]:
# Print out 5 example query IDs
print("Five example query IDs and their texts:")
for i, (query_id, query_text) in enumerate(queries.items()):
    print(f"Query ID: {query_id}, Query Text: {query_text}")
    if i == 4:  # Stop after printing 5 examples
        break

Five example query IDs and their texts:
Query ID: PLAIN-1, Query Text: why deep fried foods may cause cancer in the latest study on dietary patterns and breast cancer risk among women , healthier eating was associated with eliminating three-quarters of the odds of breast cancer , whereas less healthy eating was associated with up to nearly eight times the odds . included in the unhealthy eating pattern was the consumption of deep-fried foods , which have previously been linked to breast cancer , pancreatic cancer , lung cancer , oral and throat cancers , esophageal cancer , and cancer of the voicebox . no deep fried foods ? what ’ s a southern belle to do ? instead of deep fried foods , how about the traditional southern diet , characterized by high intakes of cooked greens , beans , legumes , cabbage , sweet potatoes and cornbread , which may reduce the risk of invasive breast cancer significantly . what about the consumption of deep-fried foods and risk of prostate cancer ? researche

In [41]:
# Install required packages
#!pip install gensim numpy nltk

# Clone the project repository containing the NFCorpus data and other necessary files for the project.
#!git clone https://github.com/cr-nlp/project1-2023.git

# Import necessary libraries
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import os
from os.path import isfile, join

# nltk downloads - run these commands if the resources haven't been downloaded yet
# nltk.download('stopwords')
# nltk.download('punkt')
    
# Define a function to load the NFCorpus data from the cloned GitHub repository.
def loadNFCorpus():
    # Define the directory where the data is located.
    dir = "./project1-2023/"
    # Load the document data which contains abstracts from PubMed.
    filename = dir + "dev.docs"
    
    # Initialize a dictionary to store document data.
    dicDoc = {}
    # Read document lines and split them into a dictionary with key as document ID and value as text.
    with open(filename,encoding='utf-8') as file:
        lines = file.readlines()
    for line in lines:
        tabLine = line.split('\t')
        key = tabLine[0]
        value = tabLine[1]
        dicDoc[key] = value
    
    # Load and parse the query data similar to document data.
    filename = dir + "dev.all.queries"
    dicReq = {}
    with open(filename, encoding='utf-8') as file:
        lines = file.readlines()
    for line in lines:
        tabLine = line.split('\t')
        key = tabLine[0]
        value = tabLine[1]
        dicReq[key] = value
    
    # Load the relevance judgments which provide a relevance score for document-query pairs.
    filename = dir + "dev.2-1-0.qrel"
    dicReqDoc = defaultdict(dict)
    with open(filename) as file:
        lines = file.readlines()
    for line in lines:
        tabLine = line.strip().split('\t')
        req = tabLine[0]
        doc = tabLine[2]
        score = int(tabLine[3])
        dicReqDoc[req][doc] = score
    
    # Return the loaded document and query data along with relevance judgments.
    return dicDoc, dicReq, dicReqDoc

# Load and process NFCorpus data
documents, queries, relevance = loadNFCorpus()

# Define stopwords outside the function to avoid repeated loading
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize and remove stopwords
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2 and word.isalpha()]
    return tokens

def text_to_embedding(text, model):
    # Preprocess and tokenize text
    tokens = preprocess_text(text)  # Make sure you have a preprocess_text function defined
    # Filter tokens based on the model's vocabulary and get embeddings
    embeddings = [model.wv[word] for word in tokens if word in model.wv.key_to_index]
    
    # If at least one token produced an embedding, return the mean vector
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.wv.vector_size)

# Process documents to create 'processed_docs' for Word2Vec training
processed_docs = [preprocess_text(doc) for doc in documents.values()]

# Create the Word2Vec model
model = Word2Vec(processed_docs, vector_size=100, window=5, min_count=1, workers=4)

# Define a function to calculate Normalized Discounted Cumulative Gain (NDCG)
def calculate_ndcg(y_true, y_score, k=5):
    # Sort the true scores in reverse order and calculate ideal DCG
    ideal_sorted_scores = sorted(y_true, reverse=True)[:k]
    ideal_dcg = sum([score / np.log2(idx + 2) for idx, score in enumerate(ideal_sorted_scores)])
    
    # Sort the predicted scores and their corresponding true scores
    scores_pairs = sorted(zip(y_score, y_true), reverse=True)[:k]
    dcg = sum([true / np.log2(idx + 2) for idx, (score, true) in enumerate(scores_pairs)])
    
    # Handle the case when ideal DCG is zero (i.e., no relevant documents)
    ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0
    return ndcg

# Define a function to calculate global NDCG for a given number of documents and queries
def calculate_global_ndcg(num_docs_per_query, num_queries, queries, documents, relevance, model):
    all_ndcg_scores = []
    
    # Shuffle and slice the queries to only process a specific number
    shuffled_query_ids = list(queries.keys())
    np.random.shuffle(shuffled_query_ids)
    selected_query_ids = shuffled_query_ids[:num_queries]

    for query_id in selected_query_ids:
        relevant_docs = relevance.get(query_id, {})
        
        # Get only a specific number of documents per query, selected based on relevance
        selected_docs = sorted(relevant_docs, key=relevant_docs.get, reverse=True)[:num_docs_per_query]
        
        # Preprocess query text
        query_tokens = preprocess_text(queries[query_id])
        # Convert query to embedding
        query_embedding = np.mean([model.wv[token] for token in query_tokens if token in model.wv], axis=0)

        true_scores = []
        predicted_scores = []
        
        for doc_id in selected_docs:
            # Get true relevance score
            true_scores.append(relevant_docs.get(doc_id, 0))
            
            # Preprocess document text
            doc_tokens = preprocess_text(documents[doc_id])
            # Convert document to embedding
            doc_embedding = np.mean([model.wv[token] for token in doc_tokens if token in model.wv], axis=0)
            # Calculate cosine similarity
            cos_sim = cosine_similarity([query_embedding], [doc_embedding])[0][0]
            predicted_scores.append(cos_sim)
        
        # Calculate NDCG score for the query
        ndcg_score = calculate_ndcg(true_scores, predicted_scores, k=len(selected_docs))
        all_ndcg_scores.append(ndcg_score)
    
    # Calculate the global NDCG
    global_ndcg = np.mean(all_ndcg_scores)
    return global_ndcg

# Example usage of the function
global_ndcg_score = calculate_global_ndcg(5, 25, queries, documents, relevance, model)  # Adjust numbers as needed
print(f"Global NDCG score for the specified number of documents and queries: {global_ndcg_score}")

Global NDCG score for the specified number of documents and queries: 0.994326152075219


In [44]:

def calculate_global_ndcg(num_docs_per_query, num_queries, queries, documents, relevance, model):
    all_ndcg_scores = []
    
    # Shuffle and slice the queries to only process a specific number
    shuffled_query_ids = list(queries.keys())
    np.random.shuffle(shuffled_query_ids)
    selected_query_ids = shuffled_query_ids[:num_queries]

    for query_id in selected_query_ids:
        relevant_docs = relevance.get(query_id, {})
        
        # Get only a specific number of documents per query, selected based on relevance
        selected_docs = sorted(relevant_docs, key=relevant_docs.get, reverse=True)[:num_docs_per_query]
        
        # Preprocess query text
        query_tokens = preprocess_text(queries[query_id])
        # Convert query to embedding
        query_embedding = np.mean([model.wv[token] for token in query_tokens if token in model.wv], axis=0)

        true_scores = []
        predicted_scores = []
        
        for doc_id in selected_docs:
            # Get true relevance score
            true_scores.append(relevant_docs.get(doc_id, 0))
            
            # Preprocess document text
            doc_tokens = preprocess_text(documents[doc_id])
            # Convert document to embedding
            doc_embedding = np.mean([model.wv[token] for token in doc_tokens if token in model.wv], axis=0)
            # Calculate cosine similarity
            cos_sim = cosine_similarity([query_embedding], [doc_embedding])[0][0]
            predicted_scores.append(cos_sim)
        
        # Calculate NDCG score for the query
        ndcg_score = calculate_ndcg(true_scores, predicted_scores, k=len(selected_docs))
        all_ndcg_scores.append(ndcg_score)
        
        # Print individual NDCG score for the query
        print(f"NDCG score for query_id {query_id}: {ndcg_score}")
    
    # Calculate the global NDCG
    global_ndcg = np.mean(all_ndcg_scores)
    return global_ndcg, all_ndcg_scores  # Return all scores for further use if needed

# Example usage of the function
global_ndcg_score, all_query_ndcg_scores = calculate_global_ndcg(5,150, queries, documents, relevance, model)  # Adjust numbers as needed
print(f"Global NDCG score for the specified number of documents and queries: {global_ndcg_score}")


NDCG score for query_id PLAIN-1077: 1.0
NDCG score for query_id PLAIN-2699: 1.0
NDCG score for query_id PLAIN-164: 1.0
NDCG score for query_id PLAIN-456: 1.0
NDCG score for query_id PLAIN-2208: 1.0
NDCG score for query_id PLAIN-382: 0.9382995875816248
NDCG score for query_id PLAIN-3311: 1.0
NDCG score for query_id PLAIN-1793: 1.0
NDCG score for query_id PLAIN-2385: 1.0
NDCG score for query_id PLAIN-1556: 1.0
NDCG score for query_id PLAIN-3250: 1.0
NDCG score for query_id PLAIN-3321: 1.0
NDCG score for query_id PLAIN-1804: 1.0
NDCG score for query_id PLAIN-2331: 1.0
NDCG score for query_id PLAIN-1961: 1.0
NDCG score for query_id PLAIN-1689: 1.0
NDCG score for query_id PLAIN-331: 0.8786746795305955
NDCG score for query_id PLAIN-2112: 1.0
NDCG score for query_id PLAIN-1049: 1.0
NDCG score for query_id PLAIN-2363: 1.0
NDCG score for query_id PLAIN-2395: 1.0
NDCG score for query_id PLAIN-680: 1.0
NDCG score for query_id PLAIN-3170: 1.0
NDCG score for query_id PLAIN-955: 1.0
NDCG score for q

In [45]:
from sklearn.metrics import ndcg_score

# Import necessary libraries
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import os
from os.path import isfile, join

# nltk downloads - run these commands if the resources haven't been downloaded yet
# nltk.download('stopwords')
# nltk.download('punkt')
    
# Define a function to load the NFCorpus data from the cloned GitHub repository.
def loadNFCorpus():
    # Define the directory where the data is located.
    dir = "./project1-2023/"
    # Load the document data which contains abstracts from PubMed.
    filename = dir + "dev.docs"
    
    # Initialize a dictionary to store document data.
    dicDoc = {}
    # Read document lines and split them into a dictionary with key as document ID and value as text.
    with open(filename,encoding='utf-8') as file:
        lines = file.readlines()
    for line in lines:
        tabLine = line.split('\t')
        key = tabLine[0]
        value = tabLine[1]
        dicDoc[key] = value
    
    # Load and parse the query data similar to document data.
    filename = dir + "dev.all.queries"
    dicReq = {}
    with open(filename, encoding='utf-8') as file:
        lines = file.readlines()
    for line in lines:
        tabLine = line.split('\t')
        key = tabLine[0]
        value = tabLine[1]
        dicReq[key] = value
    
    # Load the relevance judgments which provide a relevance score for document-query pairs.
    filename = dir + "dev.2-1-0.qrel"
    dicReqDoc = defaultdict(dict)
    with open(filename) as file:
        lines = file.readlines()
    for line in lines:
        tabLine = line.strip().split('\t')
        req = tabLine[0]
        doc = tabLine[2]
        score = int(tabLine[3])
        dicReqDoc[req][doc] = score
    
    # Return the loaded document and query data along with relevance judgments.
    return dicDoc, dicReq, dicReqDoc

# Load and process NFCorpus data
documents, queries, relevance = loadNFCorpus()

# Define stopwords outside the function to avoid repeated loading
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize and remove stopwords
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2 and word.isalpha()]
    return tokens

def text_to_embedding(text, model):
    # Preprocess and tokenize text
    tokens = preprocess_text(text)  # Make sure you have a preprocess_text function defined
    # Filter tokens based on the model's vocabulary and get embeddings
    embeddings = [model.wv[word] for word in tokens if word in model.wv.key_to_index]
    
    # If at least one token produced an embedding, return the mean vector
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.wv.vector_size)

# Process documents to create 'processed_docs' for Word2Vec training
processed_docs = [preprocess_text(doc) for doc in documents.values()]

# Create the Word2Vec model
model = Word2Vec(processed_docs, vector_size=100, window=5, min_count=1, workers=4)

# Function to convert a collection of text to a matrix of embeddings
def texts_to_embedding_matrix(texts, model):
    # Initialize an empty list to store embeddings
    embedding_matrix = []
    
    # Convert each text to an embedding and append to the list
    for text in texts:
        embedding = text_to_embedding(text, model)
        embedding_matrix.append(embedding)
    
    return np.array(embedding_matrix)

# Use the `text_to_embedding` function to convert queries into embeddings
query_embeddings = {query_id: text_to_embedding(query, model) for query_id, query in queries.items()}

# Use the `texts_to_embedding_matrix` function to convert documents into a matrix of embeddings
document_embeddings = texts_to_embedding_matrix(documents.values(), model)

# Rank documents for each query
ranked_results = {}
for query_id, query_embedding in query_embeddings.items():
    # Compute similarities of the query against all document embeddings
    similarities = cosine_similarity([query_embedding], document_embeddings)[0]
    
    # Rank documents by their similarity scores
    ranked_doc_ids = np.argsort(-similarities)  # negative for descending order
    
    # Store the top 5 document IDs for the query
    ranked_results[query_id] = [list(documents.keys())[index] for index in ranked_doc_ids[:5]]

# Calculate NDCG score
ndcg_scores = {}

# Prepare relevance scores in the order of the ranked results for calculation
true_relevance = []
pred_scores = []
for query_id, ranked_doc_ids in ranked_results.items():
    # Retrieve the true relevance scores for the ranked documents
    true_relevance_query = [relevance[query_id].get(doc_id, 0) for doc_id in ranked_doc_ids]
    true_relevance.append(true_relevance_query)
    
    # Predicted scores are simply the rank order for now, since true relevance is binary in this example
    pred_scores_query = [5-i for i in range(len(ranked_doc_ids))]  # Mock prediction scores (5 to 1)
    pred_scores.append(pred_scores_query)

    # Calculate NDCG for the current query
    ndcg_scores[query_id] = ndcg_score([true_relevance_query], [pred_scores_query], k=5)

# Calculate mean NDCG score
mean_ndcg_score = np.mean(list(ndcg_scores.values()))

# Output the mean NDCG score
print(f"Mean NDCG Score: {mean_ndcg_score}")

# Optionally, print out the ranked results for inspection
for query_id, scores in ndcg_scores.items():
    print(f"Query ID: {query_id}, NDCG Score: {scores}")


Mean NDCG Score: 0.09506297775432213
Query ID: PLAIN-1, NDCG Score: 0.0
Query ID: PLAIN-1007, NDCG Score: 0.38685280723454163
Query ID: PLAIN-101, NDCG Score: 0.0
Query ID: PLAIN-1017, NDCG Score: 0.0
Query ID: PLAIN-1027, NDCG Score: 0.0
Query ID: PLAIN-1038, NDCG Score: 0.0
Query ID: PLAIN-1049, NDCG Score: 0.0
Query ID: PLAIN-1065, NDCG Score: 0.0
Query ID: PLAIN-1077, NDCG Score: 0.0
Query ID: PLAIN-1087, NDCG Score: 0.0
Query ID: PLAIN-1097, NDCG Score: 0.0
Query ID: PLAIN-11, NDCG Score: 0.0
Query ID: PLAIN-1108, NDCG Score: 0.0
Query ID: PLAIN-111, NDCG Score: 0.0
Query ID: PLAIN-1118, NDCG Score: 0.0
Query ID: PLAIN-1129, NDCG Score: 1.0
Query ID: PLAIN-1140, NDCG Score: 0.0
Query ID: PLAIN-1150, NDCG Score: 0.0
Query ID: PLAIN-1160, NDCG Score: 0.0
Query ID: PLAIN-1171, NDCG Score: 0.0
Query ID: PLAIN-1181, NDCG Score: 0.0
Query ID: PLAIN-1192, NDCG Score: 0.0
Query ID: PLAIN-1202, NDCG Score: 0.0
Query ID: PLAIN-1213, NDCG Score: 0.0
Query ID: PLAIN-122, NDCG Score: 0.0
Query

In [46]:
# After the ranking is done, present the results for the specified query IDs
specific_query_ids = ['PLAIN-1836', 'PLAIN-1129', 'PLAIN-1118']

for query_id in specific_query_ids:
    # Print query ID and raw text of the query
    print(f"Query ID: {query_id}")
    print(f"Query Text: {queries[query_id]}")
    
    # Get top 5 ranked document IDs
    top_ranked_doc_ids = ranked_results[query_id]
    
    # Print ranked document texts
    print("Top 5 Ranked Documents by Similarity:")
    for rank, doc_id in enumerate(top_ranked_doc_ids, start=1):
        print(f"{rank}. {doc_id}: {documents[doc_id]}")
    
    # Print actual relevance scores
    print("\nActual Relevance Scores (NFCorpus):")
    actual_relevance_scores = [relevance[query_id].get(doc_id, 0) for doc_id in top_ranked_doc_ids]
    for doc_id, score in zip(top_ranked_doc_ids, actual_relevance_scores):
        print(f"{doc_id}: Relevance Score = {score}")
    
    # Print NDCG score
    ndcg = ndcg_scores[query_id]
    print(f"NDCG Score: {ndcg}\n")

# Calculate mean NDCG score
mean_ndcg_score = np.mean(list(ndcg_scores.values()))
print(f"Mean NDCG Score across all queries: {mean_ndcg_score}")


Query ID: PLAIN-1836
Query Text: persistent organic pollutants food sources associated with the highest levels of persistent organic pollutants : fish ( see here , here , here , here , here , here , here , here ) , fish oil ( see here , here , here , here ) , and chicken ( see here , here ) . there also may be concerning levels in fast food , dairy , creatine supplements , and some ayurvedic medicinal preparations . chemical obesogens ( industrial chemical pollutants ) may play a role in the current obesity epidemic . those eating plant-based diets have been found to be significantly less polluted with industrial toxins than omnivores . - industrial toxins , meat , fish , pcbs , dairy , animal products , children , obesity , eggs , dioxins , animal fat , cancer , reproductive health , milk , pesticides - -

Top 5 Ranked Documents by Similarity:
1. MED-1162: pesticide residues imported organic suspect fruits vegetables pubmed ncbi abstract consumers frequently urged avoid imported foods